[llvm-branch-commits] [llvm] [amdgpu-cfi: 7/9]: [AMDGPU] Implement CFI for CSR spills (PR #183150)
Scott Linder via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri May 15 08:00:22 PDT 2026
https://github.com/slinder1 updated https://github.com/llvm/llvm-project/pull/183150
>From 297d4db7eb09f3496049224602f992ac59f69791 Mon Sep 17 00:00:00 2001
From: Emma Pilkington <Emma.Pilkington at amd.com>
Date: Wed, 25 Jun 2025 11:06:31 -0400
Subject: [PATCH] [AMDGPU] Implement CFI for CSR spills
Introduce new SPILL pseudos to allow CFI to be generated only for CSR
spills, and to make the information accurate at the ISA-instruction level.
Other targets either generate slightly incorrect information or rely on
conventions for how spills are placed within the entry block. The
approach in this change produces larger unwind tables, with the
increased size being spent on additional DW_CFA_advance_location
instructions needed to describe the unwinding accurately.
Change-Id: I9b09646abd2ac4e56eddf5e9aeca1a5bebbd43dd
Co-authored-by: Scott Linder <scott.linder at amd.com>
Co-authored-by: Venkata Ramanaiah Nalamothu <VenkataRamanaiah.Nalamothu at amd.com>
---
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 160 +-
llvm/lib/Target/AMDGPU/SIFrameLowering.h | 43 +
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 133 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 17 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 18 +
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 69 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 223 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 15 +-
.../CodeGen/AMDGPU/GlobalISel/assert-align.ll | 4 +-
.../GlobalISel/call-outgoing-stack-args.ll | 26 +-
.../CodeGen/AMDGPU/GlobalISel/localizer.ll | 8 +-
.../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 8 +-
.../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 8 +-
.../abi-attribute-hints-undefined-behavior.ll | 4 +-
.../AMDGPU/accvgpr-spill-scc-clobber.mir | 2712 +++++++-
.../AMDGPU/agpr-copy-no-free-registers.ll | 15 +-
.../CodeGen/AMDGPU/amdgcn-call-whole-wave.ll | 24 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 5696 ++++++++++-------
.../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 24 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 118 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll | 16 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll | 48 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 1222 ++--
.../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll | 426 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 454 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 415 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 453 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 570 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 793 ++-
.../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 1015 ++-
.../AMDGPU/amdgpu-cs-chain-preserve-cc.ll | 57 +-
.../amdgpu-simplify-libcall-pow-codegen.ll | 98 +-
...tor-flatscratchinit-undefined-behavior2.ll | 26 +-
.../AMDGPU/av_spill_cross_bb_usage.mir | 12 +
llvm/test/CodeGen/AMDGPU/bf16.ll | 326 +-
.../test/CodeGen/AMDGPU/branch-relax-spill.ll | 4 +-
.../CodeGen/AMDGPU/call-args-inreg-bfloat.ll | 20 +-
.../call-args-inreg-no-sgpr-for-csrspill.ll | 12 +-
llvm/test/CodeGen/AMDGPU/call-args-inreg.ll | 220 +-
.../CodeGen/AMDGPU/call-argument-types.ll | 128 +-
.../AMDGPU/call-preserved-registers.ll | 28 +-
llvm/test/CodeGen/AMDGPU/call-skip.ll | 2 +-
.../test/CodeGen/AMDGPU/callee-frame-setup.ll | 114 +-
.../callee-special-input-vgprs-packed.ll | 46 +-
.../AMDGPU/callee-special-input-vgprs.ll | 44 +-
llvm/test/CodeGen/AMDGPU/cc-entry.ll | 7 +-
.../AMDGPU/cc-inreg-sgpr0-3-mismatch.ll | 8 +-
.../AMDGPU/copysign-simplify-demanded-bits.ll | 12 +-
.../AMDGPU/cross-block-use-is-not-abi-copy.ll | 16 +-
llvm/test/CodeGen/AMDGPU/debug-frame.ll | 498 +-
.../AMDGPU/dwarf-multi-register-use-crash.ll | 26 +-
.../dynamic-vgpr-reserve-stack-for-cwsr.ll | 14 +-
.../eliminate-frame-index-s-mov-b32.mir | 96 +
.../AMDGPU/eliminate-frame-index-select.ll | 32 +-
.../AMDGPU/eliminate-frame-index-select.mir | 1 +
.../fix-frame-reg-in-custom-csr-spills.ll | 8 +-
llvm/test/CodeGen/AMDGPU/frame-index.mir | 100 +-
...frame-setup-without-sgpr-to-vgpr-spills.ll | 4 +-
.../CodeGen/AMDGPU/function-args-inreg.ll | 30 +-
.../CodeGen/AMDGPU/gfx-call-non-gfx-func.ll | 8 +-
.../AMDGPU/gfx-callable-argument-types.ll | 3687 ++++++-----
.../gfx-callable-preserved-registers.ll | 181 +-
.../AMDGPU/gfx-callable-return-types.ll | 248 +-
.../identical-subrange-spill-infloop.ll | 2 +-
.../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 5 +-
.../CodeGen/AMDGPU/insert-waitcnts-crash.ll | 7 +-
llvm/test/CodeGen/AMDGPU/issue176578.ll | 6 +-
....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 9 +-
...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 20 +-
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 125 +-
.../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 151 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 58 +-
llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 58 +-
.../materialize-frame-index-sgpr.gfx10.ll | 100 +-
.../AMDGPU/materialize-frame-index-sgpr.ll | 169 +-
llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll | 48 +-
llvm/test/CodeGen/AMDGPU/maximumnum.ll | 124 +-
.../CodeGen/AMDGPU/memintrinsic-unroll.ll | 39 +-
.../AMDGPU/memset-param-combinations.ll | 37 +-
llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll | 50 +-
llvm/test/CodeGen/AMDGPU/minimumnum.ll | 122 +-
.../CodeGen/AMDGPU/mul24-pass-ordering.ll | 12 +-
llvm/test/CodeGen/AMDGPU/nested-calls.ll | 12 +-
.../AMDGPU/no-source-locations-in-prologue.ll | 1 +
llvm/test/CodeGen/AMDGPU/nofpclass-call.ll | 26 +-
.../AMDGPU/pei-amdgpu-cs-chain-preserve.mir | 1 +
.../AMDGPU/pei-vgpr-block-spill-csr.mir | 574 +-
.../AMDGPU/preserve-wwm-copy-dst-reg.ll | 14 +-
llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 11 +-
.../AMDGPU/sgpr-spill-overlap-wwm-reserve.mir | 67 +
.../AMDGPU/shufflevector.v2i64.v8i64.ll | 325 +-
.../si-lower-sgpr-spills-vgpr-lanes-usage.mir | 3 +
.../CodeGen/AMDGPU/si-lower-sgpr-spills.mir | 89 +-
llvm/test/CodeGen/AMDGPU/sibling-call.ll | 34 +-
.../spill-partial-csr-sgpr-live-ins.mir | 5 +
.../AMDGPU/spill-sgpr-csr-live-ins.mir | 1 +
.../AMDGPU/spill-sgpr-to-virtual-vgpr.mir | 16 +
llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll | 16 +-
.../spill_more_than_wavesize_csr_sgprs.ll | 8 +-
llvm/test/CodeGen/AMDGPU/stack-realign.ll | 16 +-
.../CodeGen/AMDGPU/stacksave_stackrestore.ll | 12 +-
.../AMDGPU/strictfp_f16_abi_promote.ll | 40 +-
.../CodeGen/AMDGPU/swdev504645-global-fold.ll | 8 +-
.../tail-call-inreg-arguments.waterfall.ll | 14 +-
.../AMDGPU/tuple-allocation-failure.ll | 6 +-
...unfold-masked-merge-scalar-variablemask.ll | 30 +-
.../unspill-vgpr-after-rewrite-vgpr-mfma.ll | 18 +-
llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll | 22 +-
.../AMDGPU/vgpr-mark-last-scratch-load.ll | 85 +-
.../CodeGen/AMDGPU/vgpr-tuple-allocation.ll | 81 +-
llvm/test/CodeGen/AMDGPU/wave32.ll | 12 +-
.../CodeGen/AMDGPU/whole-wave-functions.ll | 99 +-
.../AMDGPU/whole-wave-register-copy.ll | 2 +-
.../AMDGPU/whole-wave-register-spill.ll | 2 +-
114 files changed, 14470 insertions(+), 9172 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 2c3231634cdaa..29e83b4fab2e3 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -2228,26 +2228,95 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
return true;
}
+/// Return the set of all root registers of regunits live-in to @p MBB.
+///
+/// Intended to avoid using the expensive @c MCRegAliasIterator when deciding
+/// if a register to be spilled is already live-in (see @c isAnyRootLiveIn).
+static SparseBitVector<> buildLiveInRoots(const MachineBasicBlock &MBB,
+ const SIRegisterInfo &TRI) {
+ SparseBitVector<> LiveInRoots;
+ for (const auto &LI : MBB.liveins()) {
+ for (MCRegUnitMaskIterator MI(LI.PhysReg, &TRI); MI.isValid(); ++MI) {
+ auto [Unit, UnitLaneMask] = *MI;
+ if ((LI.LaneMask & UnitLaneMask).none())
+ continue;
+ for (MCRegUnitRootIterator RI(Unit, &TRI); RI.isValid(); ++RI)
+ LiveInRoots.set(*RI);
+ }
+ }
+ return LiveInRoots;
+}
+
+/// Returns true iff any root of @p Reg is in @p LiveInRoots
+/// (see @c buildLiveInRoots).
+static bool isAnyRootLiveIn(const SparseBitVector<> &LiveInRoots,
+ const SIRegisterInfo &TRI, MCRegister Reg) {
+ for (MCRegUnitIterator UI(Reg, &TRI); UI.isValid(); ++UI) {
+ for (MCRegUnitRootIterator RI(*UI, &TRI); RI.isValid(); ++RI) {
+ if (LiveInRoots.test(*RI))
+ return true;
+ }
+ }
+ return false;
+}
+
+void SIFrameLowering::spillCalleeSavedRegisterWithoutBlockOps(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const CalleeSavedInfo &CS, const SIInstrInfo *TII,
+ const SIRegisterInfo &TRI,
+ const std::optional<SparseBitVector<>> &LiveInRoots) const {
+ MCRegister Reg = CS.getReg();
+
+ // We assume a sortUniqueLiveIns later
+ MBB.addLiveIn(Reg);
+
+ if (CS.isSpilledToReg()) {
+ BuildMI(MBB, MI, DebugLoc(), TII->get(TargetOpcode::COPY), CS.getDstReg())
+ .addReg(Reg, getKillRegState(true));
+ } else {
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
+ bool IsKill = true;
+ // If this value was already livein, we probably have a direct use of
+ // the incoming register value, so don't kill at the spill point. This
+ // happens since we pass some special inputs (workgroup IDs) in the
+ // callee saved range.
+ if (LiveInRoots)
+ IsKill = !isAnyRootLiveIn(*LiveInRoots, TRI, Reg);
+ TII->storeRegToStackSlotCFI(MBB, MI, Reg, IsKill, CS.getFrameIdx(), RC);
+ }
+}
+
bool SIFrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *OrigTRI) const {
+ auto &TRI = *static_cast<const SIRegisterInfo *>(OrigTRI);
MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- if (!ST.useVGPRBlockOpsForCSR())
- return false;
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ std::optional<SparseBitVector<>> LiveInRoots;
+ if (MBB.getParent()->getRegInfo().tracksLiveness())
+ LiveInRoots = buildLiveInRoots(MBB, TRI);
+
+ if (!ST.useVGPRBlockOpsForCSR()) {
+ for (const CalleeSavedInfo &CS : CSI)
+ spillCalleeSavedRegisterWithoutBlockOps(MBB, MI, CS, TII, TRI,
+ LiveInRoots);
+ if (LiveInRoots)
+ MBB.sortUniqueLiveIns();
+ return true;
+ }
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const SIInstrInfo *TII = ST.getInstrInfo();
SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
- const TargetRegisterClass *BlockRegClass =
- static_cast<const SIRegisterInfo *>(TRI)->getRegClassForBlockOp(*MF);
+ const TargetRegisterClass *BlockRegClass = TRI.getRegClassForBlockOp(*MF);
for (const CalleeSavedInfo &CS : CSI) {
Register Reg = CS.getReg();
if (!BlockRegClass->contains(Reg) ||
!FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
- spillCalleeSavedRegister(MBB, MI, CS, TII, TRI);
+ spillCalleeSavedRegisterWithoutBlockOps(MBB, MI, CS, TII, TRI,
+ LiveInRoots);
continue;
}
@@ -2262,10 +2331,10 @@ bool SIFrameLowering::spillCalleeSavedRegisters(
FrameInfo.getObjectAlign(FrameIndex));
BuildMI(MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
+ TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE))
.addReg(Reg, getKillRegState(false))
.addFrameIndex(FrameIndex)
- .addReg(MFI->getStackPtrOffsetReg())
+ .addReg(FuncInfo->getStackPtrOffsetReg())
.addImm(0)
.addImm(Mask)
.addMemOperand(MMO);
@@ -2276,16 +2345,20 @@ bool SIFrameLowering::spillCalleeSavedRegisters(
// VGPRs in the register block is reserved (e.g. if it's a WWM register),
// then the whole block will be marked as reserved and `updateLiveness` will
// skip it.
- MBB.addLiveIn(Reg);
+ if (LiveInRoots)
+ MBB.addLiveIn(Reg);
}
- MBB.sortUniqueLiveIns();
+ if (LiveInRoots)
+ MBB.sortUniqueLiveIns();
return true;
}
bool SIFrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *OrigTRI) const {
+ auto &TRI = *static_cast<const SIRegisterInfo *>(OrigTRI);
MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
if (!ST.useVGPRBlockOpsForCSR())
@@ -2294,13 +2367,12 @@ bool SIFrameLowering::restoreCalleeSavedRegisters(
SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &MFI = MF->getFrameInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo *SITRI = static_cast<const SIRegisterInfo *>(TRI);
- const TargetRegisterClass *BlockRegClass = SITRI->getRegClassForBlockOp(*MF);
+ const TargetRegisterClass *BlockRegClass = TRI.getRegClassForBlockOp(*MF);
for (const CalleeSavedInfo &CS : reverse(CSI)) {
Register Reg = CS.getReg();
if (!BlockRegClass->contains(Reg) ||
!FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
- restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI);
+ restoreCalleeSavedRegister(MBB, MI, CS, TII, &TRI);
continue;
}
@@ -2320,7 +2392,7 @@ bool SIFrameLowering::restoreCalleeSavedRegisters(
.addImm(0)
.addImm(Mask)
.addMemOperand(MMO);
- SITRI->addImplicitUsesForBlockCSRLoad(MIB, Reg);
+ TRI.addImplicitUsesForBlockCSRLoad(MIB, Reg);
// Add the register to the liveins. This is necessary because if any of the
// VGPRs in the register block is reserved (e.g. if it's a WWM register),
@@ -2447,6 +2519,22 @@ MachineInstr *SIFrameLowering::buildCFI(MachineBasicBlock &MBB,
.setMIFlag(Flag);
}
+MachineInstr *SIFrameLowering::buildCFIForVRegToVRegSpill(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const MCRegister Reg, const MCRegister RegCopy) const {
+ MachineFunction &MF = *MBB.getParent();
+ const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+ MCRegister MaskReg = MCRI.getDwarfRegNum(
+ ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC, false);
+ auto CFIInst = MCCFIInstruction::createLLVMVectorRegisterMask(
+ nullptr, MCRI.getDwarfRegNum(Reg, false),
+ MCRI.getDwarfRegNum(RegCopy, false), VGPRLaneBitSize, MaskReg,
+ ST.getWavefrontSize());
+ return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
+}
+
MachineInstr *SIFrameLowering::buildCFIForSGPRToVGPRSpill(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const MCRegister SGPR, const MCRegister VGPR,
@@ -2498,6 +2586,34 @@ MachineInstr *SIFrameLowering::buildCFIForSGPRToVGPRSpill(
return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
}
+MachineInstr *SIFrameLowering::buildCFIForSGPRToVMEMSpill(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, MCRegister SGPR, int64_t Offset) const {
+ MachineFunction &MF = *MBB.getParent();
+ const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
+ return buildCFI(MBB, MBBI, DL,
+ llvm::MCCFIInstruction::createOffset(
+ nullptr, MCRI.getDwarfRegNum(SGPR, false), Offset));
+}
+
+MachineInstr *SIFrameLowering::buildCFIForVGPRToVMEMSpill(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, MCRegister VGPR, int64_t Offset) const {
+ const MachineFunction &MF = *MBB.getParent();
+ const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+ int DwarfVGPR = MCRI.getDwarfRegNum(VGPR, false);
+ assert(DwarfVGPR != -1);
+
+ MCRegister MaskReg = MCRI.getDwarfRegNum(
+ ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC, false);
+ auto CFIInst = MCCFIInstruction::createLLVMVectorOffset(
+ nullptr, DwarfVGPR, VGPRLaneBitSize, MaskReg, ST.getWavefrontSize(),
+ Offset);
+ return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
+}
+
MachineInstr *SIFrameLowering::buildCFIForRegToSGPRPairSpill(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const MCRegister Reg, const MCRegister SGPRPair) const {
@@ -2517,3 +2633,13 @@ MachineInstr *SIFrameLowering::buildCFIForRegToSGPRPairSpill(
nullptr, DwarfReg, DwarfSGPR0, SGPRBitSize, DwarfSGPR1, SGPRBitSize);
return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
}
+
+MachineInstr *SIFrameLowering::buildCFIForSameValue(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, MCRegister Reg) const {
+ const MachineFunction &MF = *MBB.getParent();
+ const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
+ int DwarfReg = MCRI.getDwarfRegNum(Reg, /*isEH=*/false);
+ auto CFIInst = MCCFIInstruction::createSameValue(nullptr, DwarfReg);
+ return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
+}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 0ba252db5f5e7..38bf3fee0cf7b 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -14,6 +14,8 @@
namespace llvm {
+class SIInstrInfo;
+
class SIFrameLowering final : public AMDGPUFrameLowering {
public:
SIFrameLowering(StackDirection D, Align StackAl, int LAO,
@@ -55,6 +57,25 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const;
+private:
+ /// Spill a single CSR according to @p CS
+ ///
+ /// This is a separate method so it can be shared between the block-ops enabled
+ /// and disabled paths. Even when block-ops are enabled we may not have a
+ /// viable block for a specific register, so it will fall back to this
+ /// implementation.
+ ///
+ /// @p LiveInRoots conveys whether we are tracking liveness, and if we are
+ /// it captures the original live-ins before spilling in a way that can be
+ /// (relatively) efficiently checked without enumerating all register aliases.
+ /// See @c buildLiveInRoots in the implementation.
+ void spillCalleeSavedRegisterWithoutBlockOps(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const CalleeSavedInfo &CS, const SIInstrInfo *TII,
+ const SIRegisterInfo &TRI,
+ const std::optional<SparseBitVector<>> &LiveInRoots) const;
+
+public:
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI,
@@ -119,6 +140,13 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
const DebugLoc &DL, const MCCFIInstruction &CFIInst,
MachineInstr::MIFlag flag = MachineInstr::FrameSetup) const;
+ /// Create a CFI index describing a spill of the VGPR/AGPR \p Reg to another
+ /// VGPR/AGPR \p RegCopy and build a MachineInstr around it.
+ MachineInstr *buildCFIForVRegToVRegSpill(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ const MCRegister Reg,
+ const MCRegister RegCopy) const;
/// Create a CFI index describing a spill of an SGPR to a single lane of
/// a VGPR and build a MachineInstr around it.
MachineInstr *buildCFIForSGPRToVGPRSpill(MachineBasicBlock &MBB,
@@ -133,11 +161,26 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, MCRegister SGPR,
ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills) const;
+ /// Create a CFI index describing a spill of an SGPR to VMEM and
+ /// build a MachineInstr around it.
+ MachineInstr *buildCFIForSGPRToVMEMSpill(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, MCRegister SGPR,
+ int64_t Offset) const;
+ /// Create a CFI index describing a spill of a VGPR to VMEM and
+ /// build a MachineInstr around it.
+ MachineInstr *buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, MCRegister VGPR,
+ int64_t Offset) const;
MachineInstr *buildCFIForRegToSGPRPairSpill(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,
MCRegister Reg,
MCRegister SGPRPair) const;
+ MachineInstr *buildCFIForSameValue(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, MCRegister Reg) const;
// Returns true if the function may need to reserve space on the stack for the
// CWSR trap handler.
bool mayReserveScratchForCWSR(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ea86e22a0e807..2d08f9b1cb473 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1557,22 +1557,26 @@ SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
}
-static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
+static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
switch (Size) {
case 4:
- return AMDGPU::SI_SPILL_S32_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_S32_CFI_SAVE : AMDGPU::SI_SPILL_S32_SAVE;
case 8:
- return AMDGPU::SI_SPILL_S64_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_S64_CFI_SAVE : AMDGPU::SI_SPILL_S64_SAVE;
case 12:
- return AMDGPU::SI_SPILL_S96_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_S96_CFI_SAVE : AMDGPU::SI_SPILL_S96_SAVE;
case 16:
- return AMDGPU::SI_SPILL_S128_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_S128_CFI_SAVE
+ : AMDGPU::SI_SPILL_S128_SAVE;
case 20:
- return AMDGPU::SI_SPILL_S160_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_S160_CFI_SAVE
+ : AMDGPU::SI_SPILL_S160_SAVE;
case 24:
- return AMDGPU::SI_SPILL_S192_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_S192_CFI_SAVE
+ : AMDGPU::SI_SPILL_S192_SAVE;
case 28:
- return AMDGPU::SI_SPILL_S224_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_S224_CFI_SAVE
+ : AMDGPU::SI_SPILL_S224_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
case 36:
@@ -1584,69 +1588,90 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
case 48:
return AMDGPU::SI_SPILL_S384_SAVE;
case 64:
- return AMDGPU::SI_SPILL_S512_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_S512_CFI_SAVE
+ : AMDGPU::SI_SPILL_S512_SAVE;
case 128:
- return AMDGPU::SI_SPILL_S1024_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_S1024_CFI_SAVE
+ : AMDGPU::SI_SPILL_S1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
}
-static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
+static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
switch (Size) {
case 2:
return AMDGPU::SI_SPILL_V16_SAVE;
case 4:
- return AMDGPU::SI_SPILL_V32_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V32_CFI_SAVE : AMDGPU::SI_SPILL_V32_SAVE;
case 8:
- return AMDGPU::SI_SPILL_V64_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V64_CFI_SAVE : AMDGPU::SI_SPILL_V64_SAVE;
case 12:
- return AMDGPU::SI_SPILL_V96_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V96_CFI_SAVE : AMDGPU::SI_SPILL_V96_SAVE;
case 16:
- return AMDGPU::SI_SPILL_V128_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V128_CFI_SAVE
+ : AMDGPU::SI_SPILL_V128_SAVE;
case 20:
- return AMDGPU::SI_SPILL_V160_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V160_CFI_SAVE
+ : AMDGPU::SI_SPILL_V160_SAVE;
case 24:
- return AMDGPU::SI_SPILL_V192_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V192_CFI_SAVE
+ : AMDGPU::SI_SPILL_V192_SAVE;
case 28:
- return AMDGPU::SI_SPILL_V224_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V224_CFI_SAVE
+ : AMDGPU::SI_SPILL_V224_SAVE;
case 32:
- return AMDGPU::SI_SPILL_V256_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V256_CFI_SAVE
+ : AMDGPU::SI_SPILL_V256_SAVE;
case 36:
- return AMDGPU::SI_SPILL_V288_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V288_CFI_SAVE
+ : AMDGPU::SI_SPILL_V288_SAVE;
case 40:
- return AMDGPU::SI_SPILL_V320_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V320_CFI_SAVE
+ : AMDGPU::SI_SPILL_V320_SAVE;
case 44:
- return AMDGPU::SI_SPILL_V352_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V352_CFI_SAVE
+ : AMDGPU::SI_SPILL_V352_SAVE;
case 48:
- return AMDGPU::SI_SPILL_V384_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V384_CFI_SAVE
+ : AMDGPU::SI_SPILL_V384_SAVE;
case 64:
- return AMDGPU::SI_SPILL_V512_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V512_CFI_SAVE
+ : AMDGPU::SI_SPILL_V512_SAVE;
case 128:
- return AMDGPU::SI_SPILL_V1024_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_V1024_CFI_SAVE
+ : AMDGPU::SI_SPILL_V1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
}
-static unsigned getAVSpillSaveOpcode(unsigned Size) {
+static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
switch (Size) {
case 4:
- return AMDGPU::SI_SPILL_AV32_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_AV32_CFI_SAVE
+ : AMDGPU::SI_SPILL_AV32_SAVE;
case 8:
- return AMDGPU::SI_SPILL_AV64_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_AV64_CFI_SAVE
+ : AMDGPU::SI_SPILL_AV64_SAVE;
case 12:
- return AMDGPU::SI_SPILL_AV96_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_AV96_CFI_SAVE
+ : AMDGPU::SI_SPILL_AV96_SAVE;
case 16:
- return AMDGPU::SI_SPILL_AV128_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_AV128_CFI_SAVE
+ : AMDGPU::SI_SPILL_AV128_SAVE;
case 20:
- return AMDGPU::SI_SPILL_AV160_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_AV160_CFI_SAVE
+ : AMDGPU::SI_SPILL_AV160_SAVE;
case 24:
- return AMDGPU::SI_SPILL_AV192_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_AV192_CFI_SAVE
+ : AMDGPU::SI_SPILL_AV192_SAVE;
case 28:
- return AMDGPU::SI_SPILL_AV224_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_AV224_CFI_SAVE
+ : AMDGPU::SI_SPILL_AV224_SAVE;
case 32:
- return AMDGPU::SI_SPILL_AV256_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_AV256_CFI_SAVE
+ : AMDGPU::SI_SPILL_AV256_SAVE;
case 36:
return AMDGPU::SI_SPILL_AV288_SAVE;
case 40:
@@ -1656,9 +1681,11 @@ static unsigned getAVSpillSaveOpcode(unsigned Size) {
case 48:
return AMDGPU::SI_SPILL_AV384_SAVE;
case 64:
- return AMDGPU::SI_SPILL_AV512_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_AV512_CFI_SAVE
+ : AMDGPU::SI_SPILL_AV512_SAVE;
case 128:
- return AMDGPU::SI_SPILL_AV1024_SAVE;
+ return NeedsCFI ? AMDGPU::SI_SPILL_AV1024_CFI_SAVE
+ : AMDGPU::SI_SPILL_AV1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
@@ -1678,7 +1705,7 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
Register Reg, const TargetRegisterClass *RC, unsigned Size,
- const SIMachineFunctionInfo &MFI) const {
+ const SIMachineFunctionInfo &MFI, bool NeedsCFI) const {
bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
// Choose the right opcode if spilling a WWM register.
@@ -1687,15 +1714,15 @@ unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
// TODO: Check if AGPRs are available
if (ST.hasMAIInsts())
- return getAVSpillSaveOpcode(Size);
+ return getAVSpillSaveOpcode(Size, NeedsCFI);
- return getVGPRSpillSaveOpcode(Size);
+ return getVGPRSpillSaveOpcode(Size, NeedsCFI);
}
-void SIInstrInfo::storeRegToStackSlot(
+void SIInstrInfo::storeRegToStackSlotImpl(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
- MachineInstr::MIFlag Flags) const {
+ MachineInstr::MIFlag Flags, bool NeedsCFI) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
@@ -1717,7 +1744,8 @@ void SIInstrInfo::storeRegToStackSlot(
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for spilling SGPRs.
- const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
+ const MCInstrDesc &OpDesc =
+ get(getSGPRSpillSaveOpcode(SpillSize, NeedsCFI));
// The SGPR spill/restore instructions only work on number sgprs, so we need
// to make sure we are using the correct register class.
@@ -1736,8 +1764,8 @@ void SIInstrInfo::storeRegToStackSlot(
return;
}
- unsigned Opcode =
- getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
+ unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
+ SpillSize, *MFI, NeedsCFI);
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
@@ -1748,6 +1776,23 @@ void SIInstrInfo::storeRegToStackSlot(
.addMemOperand(MMO);
}
+void SIInstrInfo::storeRegToStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
+ bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
+ MachineInstr::MIFlag Flags) const {
+ storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, VReg, Flags,
+ false);
+}
+
+void SIInstrInfo::storeRegToStackSlotCFI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ Register SrcReg, bool isKill,
+ int FrameIndex,
+ const TargetRegisterClass *RC) const {
+ storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, Register(),
+ MachineInstr::NoFlags, true);
+}
+
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
switch (Size) {
case 4:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 7c081fbba8a33..453bd9840a360 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -311,6 +311,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
MachineBasicBlock::iterator I, const DebugLoc &DL,
Register SrcReg, int Value) const;
+private:
+ void storeRegToStackSlotImpl(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, Register SrcReg,
+ bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC, Register VReg,
+ MachineInstr::MIFlag Flags, bool NeedsCFI) const;
+
+public:
+ void storeRegToStackSlotCFI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, Register SrcReg,
+ bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
int64_t &ImmVal) const override;
@@ -319,7 +332,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
unsigned getVectorRegSpillSaveOpcode(Register Reg,
const TargetRegisterClass *RC,
unsigned Size,
- const SIMachineFunctionInfo &MFI) const;
+ const SIMachineFunctionInfo &MFI,
+ bool NeedsCFI) const;
unsigned
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
unsigned Size,
@@ -732,6 +746,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
static bool isBlockLoadStore(uint32_t Opcode) {
switch (Opcode) {
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
+ case AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE:
case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
case AMDGPU::SCRATCH_STORE_BLOCK_SADDR:
case AMDGPU::SCRATCH_LOAD_BLOCK_SADDR:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2fad451ad2a74..a9580394b8f4b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1156,6 +1156,11 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
let mayLoad = 0;
}
+ def _CFI_SAVE : PseudoInstSI<(outs), (ins sgpr_class:$data, i32imm:$addr)> {
+ let mayStore = 1;
+ let mayLoad = 0;
+ }
+
def _RESTORE : PseudoInstSI <
(outs sgpr_class:$data),
(ins i32imm:$addr)> {
@@ -1229,6 +1234,19 @@ multiclass SI_SPILL_VGPR <SIRegisterClassLike vgpr_class,
let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
+ def _CFI_SAVE
+ : VPseudoInstSI<(outs),
+ !con((ins vgpr_class:$vdata, i32imm:$vaddr,
+ SReg_32:$soffset, i32imm:$offset),
+ !if(HasMask, (ins SReg_32:$mask), (ins)))> {
+ let mayStore = 1;
+ let mayLoad = 0;
+ // (2 * 4) + (8 * num_subregs) bytes maximum
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+ }
+
def _RESTORE : VPseudoInstSI <
(outs vgpr_class:$vdata),
!con(
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 0ffb7456af8cc..33b9c766e225e 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -119,62 +119,24 @@ INITIALIZE_PASS_END(SILowerSGPRSpillsLegacy, DEBUG_TYPE,
char &llvm::SILowerSGPRSpillsLegacyID = SILowerSGPRSpillsLegacy::ID;
-static bool isLiveIntoMBB(MCRegister Reg, MachineBasicBlock &MBB,
- const TargetRegisterInfo *TRI) {
- for (MCRegAliasIterator R(Reg, TRI, true); R.isValid(); ++R) {
- if (MBB.isLiveIn(*R)) {
- return true;
- }
- }
- return false;
-}
-
/// Insert spill code for the callee-saved registers used in the function.
-static void insertCSRSaves(MachineBasicBlock &SaveBlock,
+static void insertCSRSaves(const GCNSubtarget &ST, MachineBasicBlock &SaveBlock,
ArrayRef<CalleeSavedInfo> CSI, SlotIndexes *Indexes,
LiveIntervals *LIS) {
- MachineFunction &MF = *SaveBlock.getParent();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *RI = ST.getRegisterInfo();
-
+ const TargetFrameLowering *TFI = ST.getFrameLowering();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
MachineBasicBlock::iterator I = SaveBlock.begin();
- if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, RI)) {
- for (const CalleeSavedInfo &CS : CSI) {
- // Insert the spill to the stack frame.
- MCRegister Reg = CS.getReg();
-
- MachineInstrSpan MIS(I, &SaveBlock);
- const TargetRegisterClass *RC = RI->getMinimalPhysRegClass(
- Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32);
-
- // If this value was already livein, we probably have a direct use of the
- // incoming register value, so don't kill at the spill point. This happens
- // since we pass some special inputs (workgroup IDs) in the callee saved
- // range.
- const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, RI);
- TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(),
- RC, Register());
-
- if (Indexes) {
- assert(std::distance(MIS.begin(), I) == 1);
- MachineInstr &Inst = *std::prev(I);
- Indexes->insertMachineInstrInMaps(Inst);
- }
-
- if (LIS)
- LIS->removeAllRegUnitsForPhysReg(Reg);
- }
- } else {
- // TFI doesn't update Indexes and LIS, so we have to do it separately.
- if (Indexes)
- Indexes->repairIndexesInRange(&SaveBlock, SaveBlock.begin(), I);
-
- if (LIS)
- for (const CalleeSavedInfo &CS : CSI)
- LIS->removeAllRegUnitsForPhysReg(CS.getReg());
- }
+ bool Success = TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI);
+ assert(Success && "spillCalleeSavedRegisters should always succeed");
+ (void)Success;
+
+ // TFI doesn't update Indexes and LIS, so we have to do it separately.
+ if (Indexes)
+ Indexes->repairIndexesInRange(&SaveBlock, SaveBlock.begin(), I);
+
+ if (LIS)
+ for (const CalleeSavedInfo &CS : CSI)
+ LIS->removeAllRegUnitsForPhysReg(CS.getReg());
}
/// Insert restore code for the callee-saved registers used in the function.
@@ -324,7 +286,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
if (!CSI.empty()) {
for (MachineBasicBlock *SaveBlock : SaveBlocks)
- insertCSRSaves(*SaveBlock, CSI, Indexes, LIS);
+ insertCSRSaves(ST, *SaveBlock, CSI, Indexes, LIS);
// Add live ins to save blocks.
assert(SaveBlocks.size() == 1 && "shrink wrapping not fully implemented");
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d89fefa550f0e..645d225ce578f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1131,6 +1131,7 @@ static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI,
unsigned Op = MI.getOpcode();
switch (Op) {
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
+ case AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE:
case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
// FIXME: This assumes the mask is statically known and not computed at
// runtime. However, some ABIs may want to compute the mask dynamically and
@@ -1138,21 +1139,29 @@ static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI,
return llvm::popcount(
(uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
case AMDGPU::SI_SPILL_S1024_SAVE:
+ case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_V1024_SAVE:
+ case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
case AMDGPU::SI_SPILL_V1024_RESTORE:
case AMDGPU::SI_SPILL_A1024_SAVE:
+ case AMDGPU::SI_SPILL_A1024_CFI_SAVE:
case AMDGPU::SI_SPILL_A1024_RESTORE:
case AMDGPU::SI_SPILL_AV1024_SAVE:
+ case AMDGPU::SI_SPILL_AV1024_CFI_SAVE:
case AMDGPU::SI_SPILL_AV1024_RESTORE:
return 32;
case AMDGPU::SI_SPILL_S512_SAVE:
+ case AMDGPU::SI_SPILL_S512_CFI_SAVE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_V512_SAVE:
+ case AMDGPU::SI_SPILL_V512_CFI_SAVE:
case AMDGPU::SI_SPILL_V512_RESTORE:
case AMDGPU::SI_SPILL_A512_SAVE:
+ case AMDGPU::SI_SPILL_A512_CFI_SAVE:
case AMDGPU::SI_SPILL_A512_RESTORE:
case AMDGPU::SI_SPILL_AV512_SAVE:
+ case AMDGPU::SI_SPILL_AV512_CFI_SAVE:
case AMDGPU::SI_SPILL_AV512_RESTORE:
return 16;
case AMDGPU::SI_SPILL_S384_SAVE:
@@ -1192,75 +1201,107 @@ static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI,
case AMDGPU::SI_SPILL_AV288_RESTORE:
return 9;
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S256_CFI_SAVE:
case AMDGPU::SI_SPILL_S256_RESTORE:
case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V256_CFI_SAVE:
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_A256_SAVE:
+ case AMDGPU::SI_SPILL_A256_CFI_SAVE:
case AMDGPU::SI_SPILL_A256_RESTORE:
case AMDGPU::SI_SPILL_AV256_SAVE:
+ case AMDGPU::SI_SPILL_AV256_CFI_SAVE:
case AMDGPU::SI_SPILL_AV256_RESTORE:
return 8;
case AMDGPU::SI_SPILL_S224_SAVE:
+ case AMDGPU::SI_SPILL_S224_CFI_SAVE:
case AMDGPU::SI_SPILL_S224_RESTORE:
case AMDGPU::SI_SPILL_V224_SAVE:
+ case AMDGPU::SI_SPILL_V224_CFI_SAVE:
case AMDGPU::SI_SPILL_V224_RESTORE:
case AMDGPU::SI_SPILL_A224_SAVE:
+ case AMDGPU::SI_SPILL_A224_CFI_SAVE:
case AMDGPU::SI_SPILL_A224_RESTORE:
case AMDGPU::SI_SPILL_AV224_SAVE:
+ case AMDGPU::SI_SPILL_AV224_CFI_SAVE:
case AMDGPU::SI_SPILL_AV224_RESTORE:
return 7;
case AMDGPU::SI_SPILL_S192_SAVE:
+ case AMDGPU::SI_SPILL_S192_CFI_SAVE:
case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_V192_SAVE:
+ case AMDGPU::SI_SPILL_V192_CFI_SAVE:
case AMDGPU::SI_SPILL_V192_RESTORE:
case AMDGPU::SI_SPILL_A192_SAVE:
+ case AMDGPU::SI_SPILL_A192_CFI_SAVE:
case AMDGPU::SI_SPILL_A192_RESTORE:
case AMDGPU::SI_SPILL_AV192_SAVE:
+ case AMDGPU::SI_SPILL_AV192_CFI_SAVE:
case AMDGPU::SI_SPILL_AV192_RESTORE:
return 6;
case AMDGPU::SI_SPILL_S160_SAVE:
+ case AMDGPU::SI_SPILL_S160_CFI_SAVE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_V160_SAVE:
+ case AMDGPU::SI_SPILL_V160_CFI_SAVE:
case AMDGPU::SI_SPILL_V160_RESTORE:
case AMDGPU::SI_SPILL_A160_SAVE:
+ case AMDGPU::SI_SPILL_A160_CFI_SAVE:
case AMDGPU::SI_SPILL_A160_RESTORE:
case AMDGPU::SI_SPILL_AV160_SAVE:
+ case AMDGPU::SI_SPILL_AV160_CFI_SAVE:
case AMDGPU::SI_SPILL_AV160_RESTORE:
return 5;
case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S128_CFI_SAVE:
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_V128_SAVE:
+ case AMDGPU::SI_SPILL_V128_CFI_SAVE:
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_A128_SAVE:
+ case AMDGPU::SI_SPILL_A128_CFI_SAVE:
case AMDGPU::SI_SPILL_A128_RESTORE:
case AMDGPU::SI_SPILL_AV128_SAVE:
+ case AMDGPU::SI_SPILL_AV128_CFI_SAVE:
case AMDGPU::SI_SPILL_AV128_RESTORE:
return 4;
case AMDGPU::SI_SPILL_S96_SAVE:
+ case AMDGPU::SI_SPILL_S96_CFI_SAVE:
case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_V96_SAVE:
+ case AMDGPU::SI_SPILL_V96_CFI_SAVE:
case AMDGPU::SI_SPILL_V96_RESTORE:
case AMDGPU::SI_SPILL_A96_SAVE:
+ case AMDGPU::SI_SPILL_A96_CFI_SAVE:
case AMDGPU::SI_SPILL_A96_RESTORE:
case AMDGPU::SI_SPILL_AV96_SAVE:
+ case AMDGPU::SI_SPILL_AV96_CFI_SAVE:
case AMDGPU::SI_SPILL_AV96_RESTORE:
return 3;
case AMDGPU::SI_SPILL_S64_SAVE:
+ case AMDGPU::SI_SPILL_S64_CFI_SAVE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_V64_SAVE:
+ case AMDGPU::SI_SPILL_V64_CFI_SAVE:
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_A64_SAVE:
+ case AMDGPU::SI_SPILL_A64_CFI_SAVE:
case AMDGPU::SI_SPILL_A64_RESTORE:
case AMDGPU::SI_SPILL_AV64_SAVE:
+ case AMDGPU::SI_SPILL_AV64_CFI_SAVE:
case AMDGPU::SI_SPILL_AV64_RESTORE:
return 2;
case AMDGPU::SI_SPILL_S32_SAVE:
+ case AMDGPU::SI_SPILL_S32_CFI_SAVE:
case AMDGPU::SI_SPILL_S32_RESTORE:
case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V32_CFI_SAVE:
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_A32_SAVE:
+ case AMDGPU::SI_SPILL_A32_CFI_SAVE:
case AMDGPU::SI_SPILL_A32_RESTORE:
case AMDGPU::SI_SPILL_AV32_SAVE:
+ case AMDGPU::SI_SPILL_AV32_CFI_SAVE:
case AMDGPU::SI_SPILL_AV32_RESTORE:
case AMDGPU::SI_SPILL_WWM_V32_SAVE:
case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
@@ -1389,14 +1430,14 @@ static int getOffenMUBUFLoad(unsigned Opc) {
}
}
-static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- int Index, unsigned Lane,
- unsigned ValueReg, bool IsKill) {
+static MachineInstrBuilder
+spillVGPRtoAGPR(const GCNSubtarget &ST, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, int Index, unsigned Lane,
+ unsigned ValueReg, bool IsKill, bool NeedsCFI) {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIFrameLowering *TFL = ST.getFrameLowering();
MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
@@ -1419,6 +1460,8 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
.addReg(Src, getKillRegState(IsKill));
CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
+ if (NeedsCFI)
+ TFL->buildCFIForVRegToVRegSpill(MBB, MI, DL, Src, Dst);
return CopyMIB;
}
unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
@@ -1427,6 +1470,8 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
.addReg(Src, getKillRegState(IsKill));
MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
+ if (NeedsCFI)
+ TFL->buildCFIForVRegToVRegSpill(MBB, MI, DL, Src, Dst);
return MIB;
}
@@ -1449,7 +1494,8 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
return false;
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
- if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
+ if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false, false)
+ .getInstr())
return true;
MachineInstrBuilder NewMI =
@@ -1514,12 +1560,13 @@ void SIRegisterInfo::buildSpillLoadStore(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
- RegScavenger *RS, LiveRegUnits *LiveUnits) const {
+ RegScavenger *RS, LiveRegUnits *LiveUnits, bool NeedsCFI) const {
assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
MachineFunction *MF = MBB.getParent();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
+ const SIFrameLowering *TFL = ST.getFrameLowering();
const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
@@ -1586,6 +1633,7 @@ void SIRegisterInfo::buildSpillLoadStore(
// last address(offset + Size) after spilling all the EltSize chunks.
int64_t MaxOffset = Offset + Size - (RemSize ? 0 : EltSize);
int64_t ScratchOffsetRegDelta = 0;
+ int64_t AdditionalCFIOffset = 0;
if (IsFlat && EltSize > 4) {
LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
@@ -1698,6 +1746,7 @@ void SIRegisterInfo::buildSpillLoadStore(
Scavenged = true;
}
+ AdditionalCFIOffset = Offset;
// We currently only support spilling VGPRs to EltSize boundaries, meaning
// we can simplify the adjustment of Offset here to just scale with
// WavefrontSize.
@@ -1816,7 +1865,8 @@ void SIRegisterInfo::buildSpillLoadStore(
Register Sub = IsSubReg
? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
: ValueReg;
- auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
+ auto MIB =
+ spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill, NeedsCFI);
if (!MIB.getInstr())
break;
if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
@@ -1940,6 +1990,18 @@ void SIRegisterInfo::buildSpillLoadStore(
ValueReg = FinalValueReg;
}
+ if (IsStore && NeedsCFI) {
+ if (TII->isBlockLoadStore(LoadStoreOp)) {
+ assert(RegOffset == 0 &&
+ "expected whole register block to be treated as single element");
+ buildCFIForBlockCSRStore(MBB, MI, ValueReg, Offset);
+ } else {
+ TFL->buildCFIForVGPRToVMEMSpill(
+ MBB, MI, DebugLoc(), SubReg,
+ (Offset + RegOffset) * ST.getWavefrontSize() + AdditionalCFIOffset);
+ }
+ }
+
if (!IsAGPR && NeedSuperRegDef)
MIB.addReg(ValueReg, RegState::ImplicitDefine);
@@ -2015,6 +2077,31 @@ void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
}
+void SIRegisterInfo::buildCFIForBlockCSRStore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ Register BlockReg,
+ int64_t Offset) const {
+ const MachineFunction *MF = MBB.getParent();
+ const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
+ uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
+ Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
+ for (unsigned RegOffset = 0; RegOffset < 32; ++RegOffset) {
+ Register VGPR = BaseVGPR + RegOffset;
+ if (Mask & (1u << RegOffset)) {
+ assert(isCalleeSavedPhysReg(VGPR, *MF));
+ ST.getFrameLowering()->buildCFIForVGPRToVMEMSpill(
+ MBB, MBBI, DebugLoc(), VGPR,
+ (Offset + RegOffset) * ST.getWavefrontSize());
+ } else if (isCalleeSavedPhysReg(VGPR, *MF)) {
+ // FIXME: This is a workaround for the fact that FrameLowering's
+ // emitPrologueEntryCFI considers the block load to clobber all registers
+ // in the block.
+ ST.getFrameLowering()->buildCFIForSameValue(MBB, MBBI, DebugLoc(),
+ BaseVGPR + RegOffset);
+ }
+ }
+}
+
void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
int Offset, bool IsLoad,
bool IsKill) const {
@@ -2053,7 +2140,7 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
RegScavenger *RS, SlotIndexes *Indexes,
LiveIntervals *LIS, bool OnlyToVGPR,
- bool SpillToPhysVGPRLane) const {
+ bool SpillToPhysVGPRLane, bool NeedsCFI) const {
assert(!MI->getOperand(0).isUndef() &&
"undef spill should have been deleted earlier");
@@ -2066,6 +2153,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
if (OnlyToVGPR && !SpillToVGPR)
return false;
+ const SIFrameLowering *TFL = ST.getFrameLowering();
+
assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
SB.SuperReg != SB.MFI.getFrameOffsetReg()));
@@ -2098,11 +2187,27 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
.addReg(SubReg, getKillRegState(UseKill))
.addImm(Spill.Lane)
.addReg(Spill.VGPR);
+
+ MachineInstr *CFI = nullptr;
+ if (NeedsCFI) {
+ if (SB.SuperReg == SB.TRI.getReturnAddressReg(SB.MF)) {
+ if (i == e - 1)
+ CFI = TFL->buildCFIForSGPRToVGPRSpill(*SB.MBB, MI, DebugLoc(),
+ AMDGPU::PC_REG, VGPRSpills);
+ } else {
+ CFI = TFL->buildCFIForSGPRToVGPRSpill(*SB.MBB, MI, DebugLoc(), SubReg,
+ Spill.VGPR, Spill.Lane);
+ }
+ }
+
if (Indexes) {
if (IsFirstSubreg)
Indexes->replaceMachineInstrInMaps(*MI, *MIB);
else
Indexes->insertMachineInstrInMaps(*MIB);
+
+ if (CFI)
+ Indexes->insertMachineInstrInMaps(*CFI);
}
if (IsFirstSubreg && SB.NumSubRegs > 1) {
@@ -2167,6 +2272,18 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
// Write out VGPR
SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
+
+ // TODO: Implement CFI for SpillToVMEM for all scenarios.
+ MachineInstr *CFI = nullptr;
+ if (NeedsCFI && SB.SuperReg == SB.TRI.getReturnAddressReg(SB.MF)) {
+ int64_t CFIOffset = (Offset * SB.EltSize +
+ SB.MF.getFrameInfo().getObjectOffset(Index)) *
+ ST.getWavefrontSize();
+ CFI = TFL->buildCFIForSGPRToVMEMSpill(*SB.MBB, MI, DebugLoc(),
+ AMDGPU::PC_REG, CFIOffset);
+ }
+ if (Indexes && CFI)
+ Indexes->insertMachineInstrInMaps(*CFI);
}
SB.restore();
@@ -2338,7 +2455,20 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
+ bool NeedsCFI = false;
switch (MI->getOpcode()) {
+ case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S512_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S256_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S224_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S192_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S160_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S128_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S96_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S64_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S32_CFI_SAVE:
+ NeedsCFI = true;
+ [[fallthrough]];
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S384_SAVE:
@@ -2353,7 +2483,8 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE:
- return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
+ return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane,
+ NeedsCFI);
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S384_RESTORE:
@@ -2396,8 +2527,23 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
? getBaseRegister()
: getFrameRegister(*MF);
+ bool NeedsCFI = false;
+
switch (MI->getOpcode()) {
// SGPR register spill
+ case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S512_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S256_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S224_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S192_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S160_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S128_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S96_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S64_CFI_SAVE:
+ case AMDGPU::SI_SPILL_S32_CFI_SAVE: {
+ NeedsCFI = true;
+ [[fallthrough]];
+ }
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S384_SAVE:
@@ -2412,7 +2558,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE: {
- return spillSGPR(MI, Index, RS);
+ return spillSGPR(MI, Index, RS, nullptr, nullptr, false, false, NeedsCFI);
}
// SGPR register restore
@@ -2434,13 +2580,40 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
// VGPR register spill
- case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: {
- // Put mask into M0.
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
- AMDGPU::M0)
- .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
+ case AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE:
+ case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
+ case AMDGPU::SI_SPILL_V512_CFI_SAVE:
+ case AMDGPU::SI_SPILL_V256_CFI_SAVE:
+ case AMDGPU::SI_SPILL_V224_CFI_SAVE:
+ case AMDGPU::SI_SPILL_V192_CFI_SAVE:
+ case AMDGPU::SI_SPILL_V160_CFI_SAVE:
+ case AMDGPU::SI_SPILL_V128_CFI_SAVE:
+ case AMDGPU::SI_SPILL_V96_CFI_SAVE:
+ case AMDGPU::SI_SPILL_V64_CFI_SAVE:
+ case AMDGPU::SI_SPILL_V32_CFI_SAVE:
+ case AMDGPU::SI_SPILL_A1024_CFI_SAVE:
+ case AMDGPU::SI_SPILL_A512_CFI_SAVE:
+ case AMDGPU::SI_SPILL_A256_CFI_SAVE:
+ case AMDGPU::SI_SPILL_A224_CFI_SAVE:
+ case AMDGPU::SI_SPILL_A192_CFI_SAVE:
+ case AMDGPU::SI_SPILL_A160_CFI_SAVE:
+ case AMDGPU::SI_SPILL_A128_CFI_SAVE:
+ case AMDGPU::SI_SPILL_A96_CFI_SAVE:
+ case AMDGPU::SI_SPILL_A64_CFI_SAVE:
+ case AMDGPU::SI_SPILL_A32_CFI_SAVE:
+ case AMDGPU::SI_SPILL_AV1024_CFI_SAVE:
+ case AMDGPU::SI_SPILL_AV512_CFI_SAVE:
+ case AMDGPU::SI_SPILL_AV256_CFI_SAVE:
+ case AMDGPU::SI_SPILL_AV224_CFI_SAVE:
+ case AMDGPU::SI_SPILL_AV192_CFI_SAVE:
+ case AMDGPU::SI_SPILL_AV160_CFI_SAVE:
+ case AMDGPU::SI_SPILL_AV128_CFI_SAVE:
+ case AMDGPU::SI_SPILL_AV96_CFI_SAVE:
+ case AMDGPU::SI_SPILL_AV64_CFI_SAVE:
+ case AMDGPU::SI_SPILL_AV32_CFI_SAVE:
+ NeedsCFI = true;
[[fallthrough]];
- }
+ case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V384_SAVE:
@@ -2486,6 +2659,16 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_AV32_SAVE:
case AMDGPU::SI_SPILL_WWM_V32_SAVE:
case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
+ assert(
+ MI->getOpcode() != AMDGPU::SI_BLOCK_SPILL_V1024_SAVE &&
+ "block spill does not currently support spilling non-CSR registers");
+
+ if (MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE)
+ // Put mask into M0.
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
+ AMDGPU::M0)
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
+
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
if (VData->isUndef()) {
@@ -2501,7 +2684,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
} else {
- Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
+ Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE
? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
: ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
@@ -2511,12 +2694,12 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
if (IsWWMRegSpill) {
TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
- RS->isRegUsed(AMDGPU::SCC));
+ RS->isRegUsed(AMDGPU::SCC));
}
buildSpillLoadStore(
*MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
- *MI->memoperands_begin(), RS);
+ *MI->memoperands_begin(), RS, nullptr, NeedsCFI);
MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(*MI, TII));
if (IsWWMRegSpill)
TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 9d1a9eae75020..c21da9a8bfb7f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -119,6 +119,13 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
void addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
Register BlockReg) const;
+ // Iterate over all VGPRs in the given BlockReg and emit CFI for each VGPR
+ // as-needed depending on the (statically known) mask, relative to the given
+ // base Offset.
+ void buildCFIForBlockCSRStore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ Register BlockReg, int64_t Offset) const;
+
const TargetRegisterClass *
getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &MF) const override;
@@ -174,8 +181,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
/// free VGPR lane to spill.
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr,
- bool OnlyToVGPR = false,
- bool SpillToPhysVGPRLane = false) const;
+ bool OnlyToVGPR = false, bool SpillToPhysVGPRLane = false,
+ bool NeedsCFI = false) const;
bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr,
@@ -450,8 +457,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned LoadStoreOp, int Index, Register ValueReg,
bool ValueIsKill, MCRegister ScratchOffsetReg,
int64_t InstrOffset, MachineMemOperand *MMO,
- RegScavenger *RS,
- LiveRegUnits *LiveUnits = nullptr) const;
+ RegScavenger *RS, LiveRegUnits *LiveUnits = nullptr,
+ bool NeedsCFI = false) const;
// Return alignment in register file of first register in a register tuple.
unsigned getRegClassAlignmentNumBits(const TargetRegisterClass *RC) const {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index e42f9e8cb1001..e2f01d9e6c69f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -13,14 +13,14 @@ define ptr addrspace(1) @call_assert_align() #0 {
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 2
-; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, ext@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, ext@rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index faa54d4209f8e..38cc5e3778520 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -222,20 +222,20 @@ define void @func_caller_stack() #2 {
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
-; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; MUBUF-NEXT: v_mov_b32_e32 v0, 11
-; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; MUBUF-NEXT: v_mov_b32_e32 v0, 12
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
@@ -257,8 +257,10 @@ define void @func_caller_stack() #2 {
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
-; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_add_u32 s0, s32, 4
; FLATSCR-NEXT: v_mov_b32_e32 v0, 9
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
@@ -270,12 +272,10 @@ define void @func_caller_stack() #2 {
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 16
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
@@ -300,15 +300,15 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
-; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
-; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
-; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
; MUBUF-NEXT: s_waitcnt vmcnt(1)
@@ -382,14 +382,14 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
-; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off
-; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 8225ea1f8fda7..614ca4a214f0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -236,14 +236,14 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: global_load_dword v0, v[0:1], off glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v[0:1], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], 0
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index 3dac24ed89fa0..1b7ef739c9479 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -353,7 +353,6 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
@@ -370,6 +369,7 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
@@ -480,7 +480,6 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
@@ -497,6 +496,7 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
@@ -4100,7 +4100,6 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
@@ -4117,6 +4116,7 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
@@ -4225,7 +4225,6 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
@@ -4242,6 +4241,7 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
index 37cad3c4596d8..3400f2e798194 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
@@ -353,7 +353,6 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX90A-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
@@ -370,6 +369,7 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
@@ -480,7 +480,6 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX950-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
@@ -497,6 +496,7 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
@@ -3081,7 +3081,6 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX90A-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
@@ -3098,6 +3097,7 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
@@ -3206,7 +3206,6 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX950-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
@@ -3223,6 +3222,7 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index bb29fb8757f0f..ca96693fc44e9 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -23,12 +23,12 @@ define void @parent_func_missing_inputs() #0 {
; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; FIXEDABI-NEXT: s_mov_b64 exec, s[18:19]
; FIXEDABI-NEXT: v_writelane_b32 v40, s16, 2
-; FIXEDABI-NEXT: s_addk_i32 s32, 0x400
; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0
+; FIXEDABI-NEXT: s_addk_i32 s32, 0x400
+; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
; FIXEDABI-NEXT: s_getpc_b64 s[16:17]
; FIXEDABI-NEXT: s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4
; FIXEDABI-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
-; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir
index 738324b3749e8..ac52ca7012561 100644
--- a/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir
+++ b/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir
@@ -367,7 +367,7 @@ body: |
; GFX90A-LABEL: name: agpr32_restore_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, $agpr234, $agpr235, 
$agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -514,229 +514,453 @@ body: |
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr80, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 700, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr81, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 696, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr82, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 692, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr83, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 688, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr84, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 684, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr85, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 680, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr86, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 676, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr87, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 672, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr88, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 668, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr89, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 664, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr90, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 660, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr91, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 656, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr92, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 652, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr93, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 648, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr94, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 644, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr95, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 640, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr96, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 636, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr97, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 632, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr98, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 628, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr99, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 624, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr100, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 620, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr101, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 616, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr102, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 612, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr103, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 608, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr104, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 604, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr105, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 600, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr106, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 596, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr107, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 592, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr108, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 588, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr109, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 584, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr110, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 580, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr111, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 576, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr112, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 572, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr113, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 568, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr114, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 564, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr115, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 560, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr116, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 556, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr117, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 552, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr118, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 548, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr119, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 544, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr120, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 540, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr121, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 536, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr122, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 532, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr123, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 528, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr124, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 524, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr125, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 520, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr126, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 516, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr127, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 512, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr128, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 508, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr129, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 504, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr130, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 500, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr131, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 496, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr132, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 492, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr133, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 488, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr134, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 484, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr135, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 480, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr136, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 476, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr137, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 472, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr138, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 468, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr139, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 464, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr140, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 460, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr141, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 456, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr142, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 452, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr143, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 448, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr144, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 444, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr145, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 440, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr146, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 436, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr147, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 432, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr148, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 428, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr149, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 424, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr150, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 420, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr151, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 416, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr152, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 412, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr153, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 408, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr154, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 404, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr155, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 400, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr156, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 396, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr157, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 392, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr158, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 388, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr159, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 384, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr160, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 380, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr161, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 376, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr162, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 372, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr163, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 368, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr164, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 364, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr165, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 360, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr166, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 356, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr167, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 352, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr168, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 348, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr169, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 344, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr170, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 340, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr171, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 336, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr172, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 332, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr173, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 328, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr174, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 324, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr175, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 320, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr176, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 316, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr177, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 312, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr178, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 308, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr179, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 304, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr180, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 300, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr181, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 296, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr182, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 292, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr183, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 288, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr184, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 284, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr185, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 280, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr186, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 276, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr187, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 272, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr188, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 268, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr189, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 264, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr190, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 260, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr191, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 256, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr192, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 252, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr193, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 248, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr194, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 244, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr195, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 240, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr196, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 236, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr197, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 232, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr198, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 228, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr199, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 224, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr200, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 220, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr201, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 216, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr202, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 212, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr203, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 208, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr204, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 204, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr205, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 200, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr206, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 196, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr207, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 192, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr208, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 188, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr209, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 184, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr210, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 180, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr211, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 176, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr212, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 172, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr213, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 168, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr214, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 164, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr215, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 160, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr216, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 156, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr217, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 152, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr218, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 148, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr219, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 144, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr220, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 140, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr221, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 136, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr222, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr223, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 128, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr224, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr225, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr226, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr227, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr228, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr229, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr230, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr231, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr232, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr233, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr234, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr235, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr236, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr237, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr238, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr239, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr240, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr241, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr242, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr243, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr244, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr245, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr246, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr247, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr248, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr249, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr250, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr251, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr252, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr253, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr254, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr255, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 704, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-NEXT: $vgpr40 = V_MOV_B32_e32 8904, implicit $exec
@@ -1326,7 +1550,7 @@ body: |
; GFX90A-FLATSCR-LABEL: name: agpr32_restore_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-FLATSCR-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-FLATSCR-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, $agpr234, 
$agpr235, $agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-FLATSCR-NEXT: {{ $}}
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -1473,229 +1697,453 @@ body: |
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr80, $sgpr32, 700, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr81, $sgpr32, 696, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr82, $sgpr32, 692, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr83, $sgpr32, 688, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr84, $sgpr32, 684, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr85, $sgpr32, 680, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr86, $sgpr32, 676, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr87, $sgpr32, 672, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr88, $sgpr32, 668, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr89, $sgpr32, 664, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr90, $sgpr32, 660, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr91, $sgpr32, 656, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr92, $sgpr32, 652, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr93, $sgpr32, 648, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr94, $sgpr32, 644, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr95, $sgpr32, 640, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr96, $sgpr32, 636, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr97, $sgpr32, 632, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr98, $sgpr32, 628, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr99, $sgpr32, 624, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr100, $sgpr32, 620, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr101, $sgpr32, 616, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr102, $sgpr32, 612, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr103, $sgpr32, 608, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr104, $sgpr32, 604, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr105, $sgpr32, 600, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr106, $sgpr32, 596, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr107, $sgpr32, 592, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr108, $sgpr32, 588, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr109, $sgpr32, 584, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr110, $sgpr32, 580, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr111, $sgpr32, 576, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr112, $sgpr32, 572, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr113, $sgpr32, 568, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr114, $sgpr32, 564, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr115, $sgpr32, 560, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr116, $sgpr32, 556, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr117, $sgpr32, 552, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr118, $sgpr32, 548, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr119, $sgpr32, 544, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr120, $sgpr32, 540, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr121, $sgpr32, 536, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr122, $sgpr32, 532, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr123, $sgpr32, 528, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr124, $sgpr32, 524, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr125, $sgpr32, 520, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr126, $sgpr32, 516, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr127, $sgpr32, 512, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr128, $sgpr32, 508, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr129, $sgpr32, 504, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr130, $sgpr32, 500, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr131, $sgpr32, 496, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr132, $sgpr32, 492, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr133, $sgpr32, 488, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr134, $sgpr32, 484, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr135, $sgpr32, 480, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr136, $sgpr32, 476, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr137, $sgpr32, 472, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr138, $sgpr32, 468, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr139, $sgpr32, 464, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr140, $sgpr32, 460, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr141, $sgpr32, 456, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr142, $sgpr32, 452, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr143, $sgpr32, 448, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr144, $sgpr32, 444, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr145, $sgpr32, 440, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr146, $sgpr32, 436, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr147, $sgpr32, 432, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr148, $sgpr32, 428, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr149, $sgpr32, 424, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr150, $sgpr32, 420, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr151, $sgpr32, 416, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr152, $sgpr32, 412, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr153, $sgpr32, 408, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr154, $sgpr32, 404, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr155, $sgpr32, 400, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr156, $sgpr32, 396, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr157, $sgpr32, 392, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr158, $sgpr32, 388, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr159, $sgpr32, 384, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr160, $sgpr32, 380, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr161, $sgpr32, 376, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr162, $sgpr32, 372, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr163, $sgpr32, 368, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr164, $sgpr32, 364, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr165, $sgpr32, 360, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr166, $sgpr32, 356, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr167, $sgpr32, 352, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr168, $sgpr32, 348, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr169, $sgpr32, 344, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr170, $sgpr32, 340, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr171, $sgpr32, 336, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr172, $sgpr32, 332, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr173, $sgpr32, 328, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr174, $sgpr32, 324, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr175, $sgpr32, 320, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr176, $sgpr32, 316, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr177, $sgpr32, 312, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr178, $sgpr32, 308, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr179, $sgpr32, 304, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr180, $sgpr32, 300, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr181, $sgpr32, 296, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr182, $sgpr32, 292, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr183, $sgpr32, 288, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr184, $sgpr32, 284, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr185, $sgpr32, 280, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr186, $sgpr32, 276, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr187, $sgpr32, 272, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr188, $sgpr32, 268, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr189, $sgpr32, 264, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr190, $sgpr32, 260, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr191, $sgpr32, 256, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr192, $sgpr32, 252, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr193, $sgpr32, 248, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr194, $sgpr32, 244, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr195, $sgpr32, 240, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr196, $sgpr32, 236, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr197, $sgpr32, 232, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr198, $sgpr32, 228, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr199, $sgpr32, 224, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr200, $sgpr32, 220, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr201, $sgpr32, 216, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr202, $sgpr32, 212, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr203, $sgpr32, 208, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr204, $sgpr32, 204, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr205, $sgpr32, 200, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr206, $sgpr32, 196, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr207, $sgpr32, 192, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr208, $sgpr32, 188, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr209, $sgpr32, 184, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr210, $sgpr32, 180, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr211, $sgpr32, 176, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr212, $sgpr32, 172, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr213, $sgpr32, 168, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr214, $sgpr32, 164, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr215, $sgpr32, 160, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr216, $sgpr32, 156, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr217, $sgpr32, 152, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr218, $sgpr32, 148, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr219, $sgpr32, 144, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr220, $sgpr32, 140, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr221, $sgpr32, 136, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr222, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr223, $sgpr32, 128, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr224, $sgpr32, 124, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr225, $sgpr32, 120, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr226, $sgpr32, 116, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr227, $sgpr32, 112, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr228, $sgpr32, 108, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr229, $sgpr32, 104, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr230, $sgpr32, 100, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr231, $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr232, $sgpr32, 92, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr233, $sgpr32, 88, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr234, $sgpr32, 84, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr235, $sgpr32, 80, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr236, $sgpr32, 76, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr237, $sgpr32, 72, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr238, $sgpr32, 68, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr239, $sgpr32, 64, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr240, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr241, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr242, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr243, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr244, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr245, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr246, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr247, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr248, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr249, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr250, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr251, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr252, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr253, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr254, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr255, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 704, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-FLATSCR-NEXT: $vgpr40 = V_MOV_B32_e32 $sgpr32, implicit $exec
@@ -2315,7 +2763,7 @@ body: |
; GFX90A-LABEL: name: agpr64_restore_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, $agpr234, $agpr235, 
$agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -2462,229 +2910,453 @@ body: |
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr80, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 700, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr81, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 696, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr82, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 692, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr83, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 688, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr84, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 684, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr85, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 680, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr86, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 676, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr87, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 672, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr88, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 668, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr89, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 664, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr90, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 660, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr91, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 656, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr92, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 652, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr93, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 648, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr94, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 644, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr95, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 640, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr96, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 636, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr97, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 632, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr98, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 628, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr99, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 624, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr100, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 620, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr101, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 616, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr102, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 612, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr103, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 608, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr104, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 604, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr105, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 600, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr106, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 596, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr107, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 592, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr108, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 588, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr109, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 584, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr110, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 580, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr111, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 576, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr112, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 572, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr113, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 568, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr114, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 564, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr115, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 560, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr116, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 556, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr117, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 552, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr118, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 548, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr119, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 544, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr120, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 540, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr121, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 536, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr122, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 532, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr123, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 528, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr124, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 524, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr125, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 520, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr126, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 516, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr127, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 512, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr128, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 508, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr129, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 504, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr130, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 500, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr131, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 496, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr132, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 492, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr133, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 488, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr134, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 484, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr135, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 480, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr136, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 476, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr137, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 472, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr138, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 468, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr139, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 464, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr140, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 460, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr141, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 456, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr142, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 452, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr143, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 448, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr144, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 444, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr145, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 440, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr146, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 436, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr147, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 432, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr148, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 428, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr149, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 424, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr150, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 420, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr151, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 416, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr152, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 412, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr153, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 408, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr154, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 404, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr155, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 400, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr156, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 396, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr157, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 392, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr158, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 388, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr159, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 384, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr160, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 380, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr161, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 376, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr162, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 372, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr163, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 368, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr164, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 364, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr165, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 360, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr166, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 356, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr167, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 352, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr168, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 348, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr169, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 344, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr170, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 340, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr171, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 336, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr172, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 332, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr173, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 328, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr174, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 324, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr175, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 320, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr176, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 316, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr177, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 312, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr178, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 308, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr179, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 304, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr180, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 300, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr181, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 296, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr182, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 292, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr183, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 288, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr184, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 284, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr185, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 280, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr186, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 276, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr187, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 272, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr188, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 268, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr189, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 264, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr190, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 260, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr191, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 256, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr192, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 252, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr193, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 248, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr194, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 244, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr195, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 240, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr196, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 236, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr197, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 232, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr198, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 228, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr199, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 224, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr200, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 220, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr201, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 216, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr202, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 212, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr203, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 208, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr204, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 204, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr205, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 200, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr206, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 196, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr207, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 192, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr208, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 188, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr209, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 184, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr210, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 180, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr211, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 176, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr212, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 172, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr213, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 168, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr214, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 164, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr215, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 160, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr216, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 156, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr217, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 152, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr218, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 148, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr219, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 144, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr220, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 140, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr221, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 136, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr222, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr223, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 128, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr224, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr225, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr226, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr227, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr228, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr229, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr230, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr231, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr232, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr233, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr234, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr235, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr236, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr237, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr238, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr239, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr240, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr241, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr242, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr243, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr244, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr245, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr246, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr247, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr248, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr249, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr250, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr251, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr252, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr253, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr254, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr255, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 704, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-NEXT: $vgpr40 = V_MOV_B32_e32 8904, implicit $exec
@@ -3277,7 +3949,7 @@ body: |
; GFX90A-FLATSCR-LABEL: name: agpr64_restore_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-FLATSCR-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-FLATSCR-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, $agpr234, 
$agpr235, $agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-FLATSCR-NEXT: {{ $}}
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -3424,229 +4096,453 @@ body: |
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr80, $sgpr32, 700, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr81, $sgpr32, 696, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr82, $sgpr32, 692, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr83, $sgpr32, 688, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr84, $sgpr32, 684, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr85, $sgpr32, 680, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr86, $sgpr32, 676, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr87, $sgpr32, 672, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr88, $sgpr32, 668, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr89, $sgpr32, 664, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr90, $sgpr32, 660, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr91, $sgpr32, 656, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr92, $sgpr32, 652, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr93, $sgpr32, 648, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr94, $sgpr32, 644, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr95, $sgpr32, 640, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr96, $sgpr32, 636, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr97, $sgpr32, 632, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr98, $sgpr32, 628, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr99, $sgpr32, 624, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr100, $sgpr32, 620, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr101, $sgpr32, 616, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr102, $sgpr32, 612, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr103, $sgpr32, 608, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr104, $sgpr32, 604, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr105, $sgpr32, 600, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr106, $sgpr32, 596, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr107, $sgpr32, 592, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr108, $sgpr32, 588, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr109, $sgpr32, 584, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr110, $sgpr32, 580, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr111, $sgpr32, 576, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr112, $sgpr32, 572, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr113, $sgpr32, 568, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr114, $sgpr32, 564, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr115, $sgpr32, 560, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr116, $sgpr32, 556, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr117, $sgpr32, 552, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr118, $sgpr32, 548, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr119, $sgpr32, 544, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr120, $sgpr32, 540, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr121, $sgpr32, 536, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr122, $sgpr32, 532, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr123, $sgpr32, 528, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr124, $sgpr32, 524, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr125, $sgpr32, 520, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr126, $sgpr32, 516, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr127, $sgpr32, 512, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr128, $sgpr32, 508, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr129, $sgpr32, 504, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr130, $sgpr32, 500, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr131, $sgpr32, 496, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr132, $sgpr32, 492, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr133, $sgpr32, 488, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr134, $sgpr32, 484, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr135, $sgpr32, 480, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr136, $sgpr32, 476, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr137, $sgpr32, 472, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr138, $sgpr32, 468, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr139, $sgpr32, 464, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr140, $sgpr32, 460, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr141, $sgpr32, 456, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr142, $sgpr32, 452, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr143, $sgpr32, 448, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr144, $sgpr32, 444, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr145, $sgpr32, 440, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr146, $sgpr32, 436, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr147, $sgpr32, 432, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr148, $sgpr32, 428, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr149, $sgpr32, 424, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr150, $sgpr32, 420, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr151, $sgpr32, 416, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr152, $sgpr32, 412, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr153, $sgpr32, 408, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr154, $sgpr32, 404, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr155, $sgpr32, 400, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr156, $sgpr32, 396, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr157, $sgpr32, 392, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr158, $sgpr32, 388, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr159, $sgpr32, 384, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr160, $sgpr32, 380, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr161, $sgpr32, 376, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr162, $sgpr32, 372, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr163, $sgpr32, 368, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr164, $sgpr32, 364, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr165, $sgpr32, 360, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr166, $sgpr32, 356, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr167, $sgpr32, 352, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr168, $sgpr32, 348, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr169, $sgpr32, 344, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr170, $sgpr32, 340, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr171, $sgpr32, 336, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr172, $sgpr32, 332, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr173, $sgpr32, 328, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr174, $sgpr32, 324, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr175, $sgpr32, 320, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr176, $sgpr32, 316, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr177, $sgpr32, 312, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr178, $sgpr32, 308, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr179, $sgpr32, 304, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr180, $sgpr32, 300, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr181, $sgpr32, 296, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr182, $sgpr32, 292, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr183, $sgpr32, 288, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr184, $sgpr32, 284, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr185, $sgpr32, 280, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr186, $sgpr32, 276, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr187, $sgpr32, 272, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr188, $sgpr32, 268, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr189, $sgpr32, 264, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr190, $sgpr32, 260, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr191, $sgpr32, 256, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr192, $sgpr32, 252, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr193, $sgpr32, 248, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr194, $sgpr32, 244, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr195, $sgpr32, 240, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr196, $sgpr32, 236, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr197, $sgpr32, 232, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr198, $sgpr32, 228, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr199, $sgpr32, 224, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr200, $sgpr32, 220, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr201, $sgpr32, 216, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr202, $sgpr32, 212, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr203, $sgpr32, 208, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr204, $sgpr32, 204, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr205, $sgpr32, 200, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr206, $sgpr32, 196, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr207, $sgpr32, 192, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr208, $sgpr32, 188, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr209, $sgpr32, 184, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr210, $sgpr32, 180, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr211, $sgpr32, 176, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr212, $sgpr32, 172, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr213, $sgpr32, 168, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr214, $sgpr32, 164, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr215, $sgpr32, 160, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr216, $sgpr32, 156, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr217, $sgpr32, 152, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr218, $sgpr32, 148, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr219, $sgpr32, 144, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr220, $sgpr32, 140, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr221, $sgpr32, 136, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr222, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr223, $sgpr32, 128, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr224, $sgpr32, 124, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr225, $sgpr32, 120, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr226, $sgpr32, 116, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr227, $sgpr32, 112, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr228, $sgpr32, 108, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr229, $sgpr32, 104, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr230, $sgpr32, 100, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr231, $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr232, $sgpr32, 92, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr233, $sgpr32, 88, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr234, $sgpr32, 84, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr235, $sgpr32, 80, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr236, $sgpr32, 76, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr237, $sgpr32, 72, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr238, $sgpr32, 68, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr239, $sgpr32, 64, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr240, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr241, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr242, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr243, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr244, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr245, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr246, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr247, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr248, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr249, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr250, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr251, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr252, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr253, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr254, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr255, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 704, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-FLATSCR-NEXT: $vgpr40 = V_MOV_B32_e32 $sgpr32, implicit $exec
@@ -4268,7 +5164,7 @@ body: |
; GFX90A-LABEL: name: agpr96_restore_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, $agpr234, $agpr235, 
$agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -4415,229 +5311,453 @@ body: |
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr80, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 700, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr81, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 696, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr82, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 692, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr83, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 688, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr84, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 684, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr85, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 680, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr86, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 676, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr87, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 672, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr88, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 668, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr89, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 664, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr90, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 660, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr91, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 656, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr92, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 652, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr93, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 648, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr94, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 644, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr95, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 640, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr96, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 636, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr97, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 632, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr98, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 628, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr99, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 624, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr100, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 620, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr101, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 616, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr102, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 612, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr103, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 608, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr104, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 604, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr105, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 600, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr106, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 596, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr107, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 592, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr108, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 588, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr109, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 584, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr110, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 580, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr111, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 576, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr112, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 572, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr113, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 568, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr114, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 564, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr115, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 560, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr116, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 556, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr117, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 552, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr118, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 548, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr119, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 544, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr120, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 540, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr121, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 536, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr122, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 532, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr123, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 528, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr124, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 524, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr125, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 520, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr126, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 516, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr127, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 512, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr128, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 508, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr129, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 504, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr130, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 500, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr131, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 496, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr132, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 492, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr133, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 488, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr134, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 484, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr135, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 480, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr136, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 476, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr137, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 472, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr138, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 468, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr139, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 464, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr140, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 460, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr141, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 456, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr142, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 452, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr143, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 448, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr144, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 444, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr145, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 440, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr146, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 436, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr147, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 432, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr148, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 428, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr149, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 424, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr150, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 420, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr151, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 416, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr152, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 412, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr153, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 408, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr154, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 404, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr155, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 400, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr156, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 396, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr157, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 392, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr158, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 388, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr159, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 384, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr160, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 380, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr161, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 376, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr162, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 372, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr163, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 368, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr164, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 364, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr165, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 360, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr166, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 356, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr167, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 352, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr168, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 348, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr169, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 344, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr170, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 340, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr171, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 336, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr172, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 332, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr173, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 328, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr174, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 324, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr175, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 320, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr176, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 316, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr177, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 312, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr178, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 308, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr179, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 304, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr180, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 300, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr181, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 296, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr182, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 292, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr183, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 288, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr184, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 284, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr185, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 280, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr186, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 276, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr187, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 272, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr188, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 268, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr189, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 264, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr190, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 260, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr191, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 256, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr192, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 252, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr193, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 248, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr194, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 244, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr195, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 240, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr196, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 236, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr197, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 232, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr198, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 228, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr199, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 224, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr200, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 220, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr201, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 216, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr202, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 212, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr203, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 208, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr204, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 204, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr205, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 200, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr206, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 196, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr207, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 192, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr208, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 188, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr209, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 184, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr210, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 180, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr211, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 176, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr212, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 172, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr213, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 168, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr214, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 164, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr215, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 160, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr216, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 156, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr217, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 152, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr218, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 148, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr219, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 144, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr220, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 140, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr221, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 136, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr222, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr223, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 128, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr224, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr225, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr226, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr227, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr228, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr229, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr230, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr231, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr232, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr233, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr234, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr235, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr236, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr237, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr238, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr239, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr240, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr241, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr242, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr243, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr244, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr245, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr246, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr247, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr248, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr249, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr250, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr251, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr252, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr253, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr254, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr255, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 704, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-NEXT: $vgpr40 = V_MOV_B32_e32 8904, implicit $exec
@@ -5233,7 +6353,7 @@ body: |
; GFX90A-FLATSCR-LABEL: name: agpr96_restore_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-FLATSCR-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-FLATSCR-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, $agpr234, 
$agpr235, $agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-FLATSCR-NEXT: {{ $}}
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -5380,229 +6500,453 @@ body: |
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr80, $sgpr32, 700, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr81, $sgpr32, 696, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr82, $sgpr32, 692, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr83, $sgpr32, 688, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr84, $sgpr32, 684, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr85, $sgpr32, 680, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr86, $sgpr32, 676, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr87, $sgpr32, 672, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr88, $sgpr32, 668, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr89, $sgpr32, 664, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr90, $sgpr32, 660, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr91, $sgpr32, 656, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr92, $sgpr32, 652, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr93, $sgpr32, 648, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr94, $sgpr32, 644, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr95, $sgpr32, 640, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr96, $sgpr32, 636, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr97, $sgpr32, 632, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr98, $sgpr32, 628, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr99, $sgpr32, 624, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr100, $sgpr32, 620, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr101, $sgpr32, 616, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr102, $sgpr32, 612, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr103, $sgpr32, 608, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr104, $sgpr32, 604, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr105, $sgpr32, 600, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr106, $sgpr32, 596, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr107, $sgpr32, 592, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr108, $sgpr32, 588, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr109, $sgpr32, 584, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr110, $sgpr32, 580, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr111, $sgpr32, 576, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr112, $sgpr32, 572, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr113, $sgpr32, 568, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr114, $sgpr32, 564, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr115, $sgpr32, 560, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr116, $sgpr32, 556, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr117, $sgpr32, 552, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr118, $sgpr32, 548, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr119, $sgpr32, 544, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr120, $sgpr32, 540, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr121, $sgpr32, 536, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr122, $sgpr32, 532, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr123, $sgpr32, 528, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr124, $sgpr32, 524, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr125, $sgpr32, 520, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr126, $sgpr32, 516, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr127, $sgpr32, 512, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr128, $sgpr32, 508, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr129, $sgpr32, 504, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr130, $sgpr32, 500, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr131, $sgpr32, 496, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr132, $sgpr32, 492, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr133, $sgpr32, 488, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr134, $sgpr32, 484, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr135, $sgpr32, 480, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr136, $sgpr32, 476, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr137, $sgpr32, 472, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr138, $sgpr32, 468, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr139, $sgpr32, 464, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr140, $sgpr32, 460, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr141, $sgpr32, 456, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr142, $sgpr32, 452, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr143, $sgpr32, 448, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr144, $sgpr32, 444, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr145, $sgpr32, 440, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr146, $sgpr32, 436, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr147, $sgpr32, 432, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr148, $sgpr32, 428, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr149, $sgpr32, 424, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr150, $sgpr32, 420, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr151, $sgpr32, 416, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr152, $sgpr32, 412, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr153, $sgpr32, 408, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr154, $sgpr32, 404, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr155, $sgpr32, 400, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr156, $sgpr32, 396, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr157, $sgpr32, 392, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr158, $sgpr32, 388, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr159, $sgpr32, 384, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr160, $sgpr32, 380, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr161, $sgpr32, 376, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr162, $sgpr32, 372, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr163, $sgpr32, 368, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr164, $sgpr32, 364, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr165, $sgpr32, 360, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr166, $sgpr32, 356, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr167, $sgpr32, 352, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr168, $sgpr32, 348, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr169, $sgpr32, 344, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr170, $sgpr32, 340, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr171, $sgpr32, 336, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr172, $sgpr32, 332, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr173, $sgpr32, 328, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr174, $sgpr32, 324, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr175, $sgpr32, 320, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr176, $sgpr32, 316, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr177, $sgpr32, 312, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr178, $sgpr32, 308, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr179, $sgpr32, 304, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr180, $sgpr32, 300, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr181, $sgpr32, 296, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr182, $sgpr32, 292, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr183, $sgpr32, 288, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr184, $sgpr32, 284, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr185, $sgpr32, 280, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr186, $sgpr32, 276, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr187, $sgpr32, 272, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr188, $sgpr32, 268, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr189, $sgpr32, 264, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr190, $sgpr32, 260, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr191, $sgpr32, 256, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr192, $sgpr32, 252, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr193, $sgpr32, 248, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr194, $sgpr32, 244, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr195, $sgpr32, 240, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr196, $sgpr32, 236, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr197, $sgpr32, 232, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr198, $sgpr32, 228, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr199, $sgpr32, 224, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr200, $sgpr32, 220, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr201, $sgpr32, 216, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr202, $sgpr32, 212, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr203, $sgpr32, 208, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr204, $sgpr32, 204, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr205, $sgpr32, 200, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr206, $sgpr32, 196, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr207, $sgpr32, 192, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr208, $sgpr32, 188, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr209, $sgpr32, 184, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr210, $sgpr32, 180, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr211, $sgpr32, 176, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr212, $sgpr32, 172, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr213, $sgpr32, 168, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr214, $sgpr32, 164, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr215, $sgpr32, 160, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr216, $sgpr32, 156, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr217, $sgpr32, 152, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr218, $sgpr32, 148, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr219, $sgpr32, 144, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr220, $sgpr32, 140, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr221, $sgpr32, 136, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr222, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr223, $sgpr32, 128, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr224, $sgpr32, 124, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr225, $sgpr32, 120, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr226, $sgpr32, 116, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr227, $sgpr32, 112, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr228, $sgpr32, 108, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr229, $sgpr32, 104, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr230, $sgpr32, 100, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr231, $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr232, $sgpr32, 92, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr233, $sgpr32, 88, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr234, $sgpr32, 84, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr235, $sgpr32, 80, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr236, $sgpr32, 76, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr237, $sgpr32, 72, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr238, $sgpr32, 68, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr239, $sgpr32, 64, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr240, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr241, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr242, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr243, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr244, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr245, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr246, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr247, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr248, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr249, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr250, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr251, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr252, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr253, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr254, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr255, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 704, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-FLATSCR-NEXT: $vgpr40 = V_MOV_B32_e32 $sgpr32, implicit $exec
@@ -6220,7 +7564,7 @@ body: |
; GFX90A-LABEL: name: agpr32_save_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-NEXT: liveins: $agpr0, $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-NEXT: liveins: $agpr0, $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, $agpr234, 
$agpr235, $agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -6367,229 +7711,453 @@ body: |
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr80, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 700, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr81, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 696, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr82, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 692, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr83, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 688, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr84, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 684, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr85, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 680, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr86, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 676, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr87, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 672, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr88, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 668, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr89, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 664, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr90, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 660, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr91, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 656, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr92, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 652, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr93, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 648, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr94, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 644, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr95, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 640, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr96, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 636, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr97, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 632, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr98, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 628, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr99, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 624, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr100, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 620, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr101, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 616, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr102, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 612, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr103, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 608, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr104, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 604, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr105, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 600, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr106, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 596, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr107, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 592, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr108, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 588, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr109, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 584, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr110, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 580, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr111, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 576, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr112, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 572, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr113, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 568, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr114, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 564, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr115, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 560, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr116, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 556, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr117, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 552, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr118, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 548, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr119, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 544, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr120, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 540, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr121, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 536, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr122, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 532, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr123, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 528, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr124, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 524, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr125, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 520, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr126, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 516, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr127, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 512, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr128, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 508, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr129, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 504, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr130, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 500, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr131, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 496, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr132, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 492, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr133, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 488, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr134, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 484, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr135, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 480, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr136, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 476, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr137, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 472, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr138, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 468, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr139, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 464, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr140, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 460, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr141, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 456, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr142, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 452, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr143, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 448, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr144, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 444, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr145, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 440, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr146, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 436, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr147, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 432, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr148, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 428, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr149, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 424, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr150, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 420, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr151, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 416, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr152, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 412, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr153, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 408, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr154, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 404, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr155, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 400, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr156, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 396, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr157, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 392, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr158, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 388, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr159, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 384, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr160, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 380, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr161, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 376, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr162, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 372, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr163, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 368, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr164, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 364, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr165, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 360, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr166, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 356, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr167, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 352, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr168, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 348, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr169, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 344, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr170, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 340, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr171, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 336, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr172, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 332, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr173, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 328, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr174, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 324, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr175, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 320, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr176, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 316, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr177, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 312, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr178, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 308, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr179, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 304, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr180, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 300, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr181, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 296, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr182, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 292, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr183, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 288, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr184, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 284, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr185, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 280, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr186, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 276, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr187, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 272, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr188, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 268, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr189, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 264, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr190, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 260, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr191, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 256, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr192, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 252, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr193, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 248, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr194, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 244, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr195, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 240, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr196, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 236, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr197, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 232, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr198, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 228, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr199, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 224, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr200, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 220, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr201, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 216, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr202, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 212, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr203, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 208, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr204, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 204, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr205, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 200, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr206, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 196, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr207, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 192, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr208, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 188, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr209, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 184, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr210, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 180, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr211, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 176, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr212, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 172, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr213, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 168, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr214, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 164, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr215, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 160, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr216, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 156, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr217, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 152, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr218, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 148, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr219, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 144, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr220, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 140, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr221, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 136, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr222, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr223, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 128, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr224, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr225, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr226, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr227, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr228, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr229, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr230, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr231, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr232, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr233, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr234, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr235, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr236, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr237, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr238, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr239, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr240, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr241, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr242, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr243, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr244, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr245, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr246, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr247, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr248, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr249, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr250, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr251, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr252, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr253, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr254, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr255, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 704, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-NEXT: $vgpr40 = V_MOV_B32_e32 8904, implicit $exec
@@ -7179,7 +8747,7 @@ body: |
; GFX90A-FLATSCR-LABEL: name: agpr32_save_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-FLATSCR-NEXT: liveins: $agpr0, $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-FLATSCR-NEXT: liveins: $agpr0, $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, 
$agpr234, $agpr235, $agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-FLATSCR-NEXT: {{ $}}
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -7326,229 +8894,453 @@ body: |
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr80, $sgpr32, 700, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr81, $sgpr32, 696, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr82, $sgpr32, 692, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr83, $sgpr32, 688, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr84, $sgpr32, 684, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr85, $sgpr32, 680, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr86, $sgpr32, 676, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr87, $sgpr32, 672, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr88, $sgpr32, 668, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr89, $sgpr32, 664, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr90, $sgpr32, 660, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr91, $sgpr32, 656, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr92, $sgpr32, 652, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr93, $sgpr32, 648, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr94, $sgpr32, 644, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr95, $sgpr32, 640, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr96, $sgpr32, 636, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr97, $sgpr32, 632, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr98, $sgpr32, 628, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr99, $sgpr32, 624, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr100, $sgpr32, 620, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr101, $sgpr32, 616, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr102, $sgpr32, 612, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr103, $sgpr32, 608, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr104, $sgpr32, 604, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr105, $sgpr32, 600, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr106, $sgpr32, 596, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr107, $sgpr32, 592, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr108, $sgpr32, 588, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr109, $sgpr32, 584, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr110, $sgpr32, 580, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr111, $sgpr32, 576, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr112, $sgpr32, 572, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr113, $sgpr32, 568, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr114, $sgpr32, 564, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr115, $sgpr32, 560, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr116, $sgpr32, 556, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr117, $sgpr32, 552, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr118, $sgpr32, 548, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr119, $sgpr32, 544, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr120, $sgpr32, 540, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr121, $sgpr32, 536, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr122, $sgpr32, 532, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr123, $sgpr32, 528, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr124, $sgpr32, 524, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr125, $sgpr32, 520, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr126, $sgpr32, 516, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr127, $sgpr32, 512, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr128, $sgpr32, 508, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr129, $sgpr32, 504, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr130, $sgpr32, 500, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr131, $sgpr32, 496, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr132, $sgpr32, 492, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr133, $sgpr32, 488, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr134, $sgpr32, 484, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr135, $sgpr32, 480, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr136, $sgpr32, 476, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr137, $sgpr32, 472, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr138, $sgpr32, 468, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr139, $sgpr32, 464, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr140, $sgpr32, 460, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr141, $sgpr32, 456, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr142, $sgpr32, 452, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr143, $sgpr32, 448, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr144, $sgpr32, 444, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr145, $sgpr32, 440, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr146, $sgpr32, 436, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr147, $sgpr32, 432, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr148, $sgpr32, 428, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr149, $sgpr32, 424, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr150, $sgpr32, 420, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr151, $sgpr32, 416, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr152, $sgpr32, 412, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr153, $sgpr32, 408, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr154, $sgpr32, 404, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr155, $sgpr32, 400, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr156, $sgpr32, 396, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr157, $sgpr32, 392, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr158, $sgpr32, 388, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr159, $sgpr32, 384, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr160, $sgpr32, 380, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr161, $sgpr32, 376, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr162, $sgpr32, 372, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr163, $sgpr32, 368, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr164, $sgpr32, 364, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr165, $sgpr32, 360, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr166, $sgpr32, 356, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr167, $sgpr32, 352, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr168, $sgpr32, 348, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr169, $sgpr32, 344, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr170, $sgpr32, 340, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr171, $sgpr32, 336, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr172, $sgpr32, 332, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr173, $sgpr32, 328, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr174, $sgpr32, 324, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr175, $sgpr32, 320, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr176, $sgpr32, 316, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr177, $sgpr32, 312, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr178, $sgpr32, 308, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr179, $sgpr32, 304, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr180, $sgpr32, 300, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr181, $sgpr32, 296, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr182, $sgpr32, 292, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr183, $sgpr32, 288, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr184, $sgpr32, 284, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr185, $sgpr32, 280, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr186, $sgpr32, 276, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr187, $sgpr32, 272, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr188, $sgpr32, 268, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr189, $sgpr32, 264, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr190, $sgpr32, 260, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr191, $sgpr32, 256, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr192, $sgpr32, 252, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr193, $sgpr32, 248, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr194, $sgpr32, 244, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr195, $sgpr32, 240, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr196, $sgpr32, 236, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr197, $sgpr32, 232, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr198, $sgpr32, 228, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr199, $sgpr32, 224, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr200, $sgpr32, 220, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr201, $sgpr32, 216, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr202, $sgpr32, 212, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr203, $sgpr32, 208, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr204, $sgpr32, 204, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr205, $sgpr32, 200, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr206, $sgpr32, 196, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr207, $sgpr32, 192, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr208, $sgpr32, 188, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr209, $sgpr32, 184, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr210, $sgpr32, 180, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr211, $sgpr32, 176, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr212, $sgpr32, 172, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr213, $sgpr32, 168, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr214, $sgpr32, 164, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr215, $sgpr32, 160, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr216, $sgpr32, 156, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr217, $sgpr32, 152, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr218, $sgpr32, 148, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr219, $sgpr32, 144, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr220, $sgpr32, 140, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr221, $sgpr32, 136, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr222, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr223, $sgpr32, 128, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr224, $sgpr32, 124, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr225, $sgpr32, 120, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr226, $sgpr32, 116, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr227, $sgpr32, 112, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr228, $sgpr32, 108, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr229, $sgpr32, 104, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr230, $sgpr32, 100, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr231, $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr232, $sgpr32, 92, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr233, $sgpr32, 88, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr234, $sgpr32, 84, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr235, $sgpr32, 80, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr236, $sgpr32, 76, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr237, $sgpr32, 72, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr238, $sgpr32, 68, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr239, $sgpr32, 64, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr240, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr241, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr242, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr243, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr244, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr245, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr246, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr247, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr248, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr249, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr250, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr251, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr252, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr253, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr254, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr255, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 704, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-FLATSCR-NEXT: $vgpr40 = V_MOV_B32_e32 $sgpr32, implicit $exec
@@ -8167,7 +9959,7 @@ body: |
; GFX90A-LABEL: name: agpr64_save_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, $agpr234, $agpr235, 
$agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -8314,229 +10106,453 @@ body: |
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr80, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 700, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr81, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 696, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr82, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 692, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr83, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 688, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr84, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 684, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr85, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 680, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr86, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 676, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr87, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 672, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr88, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 668, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr89, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 664, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr90, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 660, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr91, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 656, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr92, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 652, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr93, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 648, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr94, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 644, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr95, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 640, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr96, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 636, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr97, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 632, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr98, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 628, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr99, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 624, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr100, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 620, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr101, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 616, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr102, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 612, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr103, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 608, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr104, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 604, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr105, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 600, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr106, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 596, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr107, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 592, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr108, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 588, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr109, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 584, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr110, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 580, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr111, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 576, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr112, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 572, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr113, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 568, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr114, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 564, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr115, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 560, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr116, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 556, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr117, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 552, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr118, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 548, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr119, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 544, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr120, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 540, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr121, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 536, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr122, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 532, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr123, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 528, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr124, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 524, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr125, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 520, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr126, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 516, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr127, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 512, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr128, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 508, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr129, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 504, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr130, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 500, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr131, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 496, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr132, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 492, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr133, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 488, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr134, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 484, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr135, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 480, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr136, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 476, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr137, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 472, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr138, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 468, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr139, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 464, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr140, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 460, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr141, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 456, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr142, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 452, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr143, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 448, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr144, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 444, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr145, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 440, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr146, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 436, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr147, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 432, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr148, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 428, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr149, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 424, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr150, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 420, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr151, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 416, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr152, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 412, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr153, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 408, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr154, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 404, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr155, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 400, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr156, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 396, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr157, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 392, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr158, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 388, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr159, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 384, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr160, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 380, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr161, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 376, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr162, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 372, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr163, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 368, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr164, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 364, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr165, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 360, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr166, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 356, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr167, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 352, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr168, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 348, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr169, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 344, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr170, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 340, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr171, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 336, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr172, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 332, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr173, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 328, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr174, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 324, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr175, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 320, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr176, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 316, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr177, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 312, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr178, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 308, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr179, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 304, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr180, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 300, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr181, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 296, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr182, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 292, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr183, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 288, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr184, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 284, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr185, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 280, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr186, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 276, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr187, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 272, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr188, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 268, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr189, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 264, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr190, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 260, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr191, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 256, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr192, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 252, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr193, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 248, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr194, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 244, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr195, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 240, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr196, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 236, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr197, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 232, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr198, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 228, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr199, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 224, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr200, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 220, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr201, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 216, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr202, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 212, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr203, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 208, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr204, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 204, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr205, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 200, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr206, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 196, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr207, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 192, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr208, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 188, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr209, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 184, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr210, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 180, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr211, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 176, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr212, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 172, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr213, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 168, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr214, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 164, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr215, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 160, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr216, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 156, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr217, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 152, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr218, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 148, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr219, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 144, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr220, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 140, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr221, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 136, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr222, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr223, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 128, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr224, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr225, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr226, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr227, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr228, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr229, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr230, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr231, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr232, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr233, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr234, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr235, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr236, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr237, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr238, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr239, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr240, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr241, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr242, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr243, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr244, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr245, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr246, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr247, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr248, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr249, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr250, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr251, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr252, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr253, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr254, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr255, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 704, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-NEXT: $vgpr40 = V_MOV_B32_e32 8904, implicit $exec
@@ -9129,7 +11145,7 @@ body: |
; GFX90A-FLATSCR-LABEL: name: agpr64_save_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-FLATSCR-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-FLATSCR-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, $agpr234, 
$agpr235, $agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-FLATSCR-NEXT: {{ $}}
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -9276,229 +11292,453 @@ body: |
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr80, $sgpr32, 700, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr81, $sgpr32, 696, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr82, $sgpr32, 692, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr83, $sgpr32, 688, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr84, $sgpr32, 684, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr85, $sgpr32, 680, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr86, $sgpr32, 676, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr87, $sgpr32, 672, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr88, $sgpr32, 668, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr89, $sgpr32, 664, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr90, $sgpr32, 660, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr91, $sgpr32, 656, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr92, $sgpr32, 652, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr93, $sgpr32, 648, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr94, $sgpr32, 644, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr95, $sgpr32, 640, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr96, $sgpr32, 636, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr97, $sgpr32, 632, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr98, $sgpr32, 628, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr99, $sgpr32, 624, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr100, $sgpr32, 620, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr101, $sgpr32, 616, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr102, $sgpr32, 612, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr103, $sgpr32, 608, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr104, $sgpr32, 604, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr105, $sgpr32, 600, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr106, $sgpr32, 596, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr107, $sgpr32, 592, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr108, $sgpr32, 588, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr109, $sgpr32, 584, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr110, $sgpr32, 580, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr111, $sgpr32, 576, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr112, $sgpr32, 572, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr113, $sgpr32, 568, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr114, $sgpr32, 564, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr115, $sgpr32, 560, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr116, $sgpr32, 556, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr117, $sgpr32, 552, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr118, $sgpr32, 548, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr119, $sgpr32, 544, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr120, $sgpr32, 540, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr121, $sgpr32, 536, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr122, $sgpr32, 532, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr123, $sgpr32, 528, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr124, $sgpr32, 524, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr125, $sgpr32, 520, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr126, $sgpr32, 516, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr127, $sgpr32, 512, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr128, $sgpr32, 508, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr129, $sgpr32, 504, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr130, $sgpr32, 500, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr131, $sgpr32, 496, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr132, $sgpr32, 492, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr133, $sgpr32, 488, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr134, $sgpr32, 484, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr135, $sgpr32, 480, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr136, $sgpr32, 476, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr137, $sgpr32, 472, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr138, $sgpr32, 468, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr139, $sgpr32, 464, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr140, $sgpr32, 460, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr141, $sgpr32, 456, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr142, $sgpr32, 452, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr143, $sgpr32, 448, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr144, $sgpr32, 444, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr145, $sgpr32, 440, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr146, $sgpr32, 436, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr147, $sgpr32, 432, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr148, $sgpr32, 428, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr149, $sgpr32, 424, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr150, $sgpr32, 420, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr151, $sgpr32, 416, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr152, $sgpr32, 412, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr153, $sgpr32, 408, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr154, $sgpr32, 404, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr155, $sgpr32, 400, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr156, $sgpr32, 396, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr157, $sgpr32, 392, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr158, $sgpr32, 388, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr159, $sgpr32, 384, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr160, $sgpr32, 380, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr161, $sgpr32, 376, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr162, $sgpr32, 372, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr163, $sgpr32, 368, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr164, $sgpr32, 364, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr165, $sgpr32, 360, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr166, $sgpr32, 356, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr167, $sgpr32, 352, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr168, $sgpr32, 348, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr169, $sgpr32, 344, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr170, $sgpr32, 340, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr171, $sgpr32, 336, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr172, $sgpr32, 332, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr173, $sgpr32, 328, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr174, $sgpr32, 324, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr175, $sgpr32, 320, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr176, $sgpr32, 316, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr177, $sgpr32, 312, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr178, $sgpr32, 308, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr179, $sgpr32, 304, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr180, $sgpr32, 300, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr181, $sgpr32, 296, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr182, $sgpr32, 292, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr183, $sgpr32, 288, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr184, $sgpr32, 284, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr185, $sgpr32, 280, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr186, $sgpr32, 276, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr187, $sgpr32, 272, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr188, $sgpr32, 268, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr189, $sgpr32, 264, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr190, $sgpr32, 260, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr191, $sgpr32, 256, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr192, $sgpr32, 252, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr193, $sgpr32, 248, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr194, $sgpr32, 244, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr195, $sgpr32, 240, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr196, $sgpr32, 236, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr197, $sgpr32, 232, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr198, $sgpr32, 228, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr199, $sgpr32, 224, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr200, $sgpr32, 220, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr201, $sgpr32, 216, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr202, $sgpr32, 212, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr203, $sgpr32, 208, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr204, $sgpr32, 204, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr205, $sgpr32, 200, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr206, $sgpr32, 196, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr207, $sgpr32, 192, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr208, $sgpr32, 188, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr209, $sgpr32, 184, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr210, $sgpr32, 180, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr211, $sgpr32, 176, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr212, $sgpr32, 172, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr213, $sgpr32, 168, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr214, $sgpr32, 164, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr215, $sgpr32, 160, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr216, $sgpr32, 156, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr217, $sgpr32, 152, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr218, $sgpr32, 148, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr219, $sgpr32, 144, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr220, $sgpr32, 140, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr221, $sgpr32, 136, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr222, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr223, $sgpr32, 128, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr224, $sgpr32, 124, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr225, $sgpr32, 120, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr226, $sgpr32, 116, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr227, $sgpr32, 112, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr228, $sgpr32, 108, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr229, $sgpr32, 104, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr230, $sgpr32, 100, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr231, $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr232, $sgpr32, 92, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr233, $sgpr32, 88, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr234, $sgpr32, 84, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr235, $sgpr32, 80, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr236, $sgpr32, 76, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr237, $sgpr32, 72, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr238, $sgpr32, 68, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr239, $sgpr32, 64, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr240, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr241, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr242, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr243, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr244, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr245, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr246, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr247, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr248, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr249, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr250, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr251, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr252, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr253, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr254, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr255, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 704, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-FLATSCR-NEXT: $vgpr40 = V_MOV_B32_e32 $sgpr32, implicit $exec
@@ -10118,7 +12358,7 @@ body: |
; GFX90A-LABEL: name: agpr96_save_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, $agpr234, $agpr235, 
$agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -10265,229 +12505,453 @@ body: |
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr80, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 700, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr81, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 696, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr82, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 692, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr83, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 688, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr84, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 684, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr85, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 680, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr86, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 676, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr87, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 672, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr88, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 668, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr89, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 664, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr90, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 660, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr91, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 656, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr92, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 652, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr93, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 648, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr94, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 644, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr95, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 640, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr96, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 636, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr97, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 632, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr98, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 628, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr99, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 624, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr100, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 620, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr101, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 616, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr102, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 612, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr103, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 608, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr104, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 604, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr105, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 600, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr106, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 596, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr107, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 592, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr108, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 588, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr109, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 584, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr110, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 580, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr111, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 576, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr112, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 572, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr113, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 568, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr114, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 564, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr115, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 560, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr116, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 556, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr117, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 552, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr118, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 548, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr119, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 544, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr120, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 540, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr121, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 536, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr122, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 532, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr123, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 528, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr124, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 524, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr125, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 520, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr126, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 516, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr127, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 512, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr128, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 508, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr129, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 504, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr130, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 500, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr131, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 496, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr132, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 492, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr133, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 488, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr134, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 484, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr135, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 480, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr136, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 476, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr137, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 472, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr138, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 468, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr139, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 464, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr140, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 460, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr141, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 456, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr142, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 452, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr143, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 448, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr144, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 444, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr145, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 440, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr146, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 436, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr147, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 432, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr148, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 428, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr149, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 424, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr150, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 420, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr151, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 416, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr152, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 412, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr153, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 408, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr154, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 404, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr155, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 400, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr156, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 396, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr157, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 392, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr158, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 388, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr159, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 384, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr160, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 380, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr161, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 376, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr162, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 372, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr163, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 368, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr164, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 364, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr165, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 360, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr166, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 356, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr167, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 352, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr168, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 348, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr169, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 344, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr170, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 340, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr171, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 336, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr172, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 332, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr173, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 328, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr174, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 324, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr175, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 320, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr176, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 316, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr177, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 312, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr178, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 308, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr179, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 304, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr180, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 300, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr181, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 296, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr182, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 292, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr183, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 288, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr184, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 284, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr185, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 280, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr186, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 276, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr187, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 272, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr188, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 268, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr189, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 264, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr190, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 260, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr191, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 256, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr192, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 252, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr193, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 248, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr194, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 244, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr195, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 240, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr196, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 236, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr197, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 232, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr198, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 228, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr199, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 224, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr200, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 220, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr201, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 216, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr202, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 212, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr203, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 208, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr204, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 204, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr205, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 200, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr206, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 196, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr207, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 192, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr208, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 188, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr209, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 184, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr210, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 180, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr211, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 176, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr212, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 172, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr213, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 168, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr214, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 164, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr215, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 160, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr216, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 156, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr217, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 152, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr218, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 148, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr219, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 144, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr220, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 140, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr221, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 136, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr222, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr223, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 128, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr224, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr225, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr226, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr227, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr228, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr229, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr230, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr231, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr232, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr233, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr234, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr235, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr236, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr237, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr238, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr239, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr240, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr241, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr242, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr243, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr244, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr245, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr246, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr247, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr248, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr249, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr250, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr251, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr252, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr253, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr254, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr255, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 704, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-NEXT: $vgpr40 = V_MOV_B32_e32 8904, implicit $exec
@@ -11083,7 +13547,7 @@ body: |
; GFX90A-FLATSCR-LABEL: name: agpr96_save_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX90A-FLATSCR-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+ ; GFX90A-FLATSCR-NEXT: liveins: $agpr32, $agpr33, $agpr34, $agpr35, $agpr36, $agpr37, $agpr38, $agpr39, $agpr40, $agpr41, $agpr42, $agpr43, $agpr44, $agpr45, $agpr46, $agpr47, $agpr48, $agpr49, $agpr50, $agpr51, $agpr52, $agpr53, $agpr54, $agpr55, $agpr56, $agpr57, $agpr58, $agpr59, $agpr60, $agpr61, $agpr62, $agpr63, $agpr64, $agpr65, $agpr66, $agpr67, $agpr68, $agpr69, $agpr70, $agpr71, $agpr72, $agpr73, $agpr74, $agpr75, $agpr76, $agpr77, $agpr78, $agpr79, $agpr80, $agpr81, $agpr82, $agpr83, $agpr84, $agpr85, $agpr86, $agpr87, $agpr88, $agpr89, $agpr90, $agpr91, $agpr92, $agpr93, $agpr94, $agpr95, $agpr96, $agpr97, $agpr98, $agpr99, $agpr100, $agpr101, $agpr102, $agpr103, $agpr104, $agpr105, $agpr106, $agpr107, $agpr108, $agpr109, $agpr110, $agpr111, $agpr112, $agpr113, $agpr114, $agpr115, $agpr116, $agpr117, $agpr118, $agpr119, $agpr120, $agpr121, $agpr122, $agpr123, $agpr124, $agpr125, $agpr126, $agpr127, $agpr128, $agpr129, $agpr130, $agpr131, $agpr132, $agpr133, $agpr134, $agpr135, $agpr136, $agpr137, $agpr138, $agpr139, $agpr140, $agpr141, $agpr142, $agpr143, $agpr144, $agpr145, $agpr146, $agpr147, $agpr148, $agpr149, $agpr150, $agpr151, $agpr152, $agpr153, $agpr154, $agpr155, $agpr156, $agpr157, $agpr158, $agpr159, $agpr160, $agpr161, $agpr162, $agpr163, $agpr164, $agpr165, $agpr166, $agpr167, $agpr168, $agpr169, $agpr170, $agpr171, $agpr172, $agpr173, $agpr174, $agpr175, $agpr176, $agpr177, $agpr178, $agpr179, $agpr180, $agpr181, $agpr182, $agpr183, $agpr184, $agpr185, $agpr186, $agpr187, $agpr188, $agpr189, $agpr190, $agpr191, $agpr192, $agpr193, $agpr194, $agpr195, $agpr196, $agpr197, $agpr198, $agpr199, $agpr200, $agpr201, $agpr202, $agpr203, $agpr204, $agpr205, $agpr206, $agpr207, $agpr208, $agpr209, $agpr210, $agpr211, $agpr212, $agpr213, $agpr214, $agpr215, $agpr216, $agpr217, $agpr218, $agpr219, $agpr220, $agpr221, $agpr222, $agpr223, $agpr224, $agpr225, $agpr226, $agpr227, $agpr228, $agpr229, $agpr230, $agpr231, $agpr232, $agpr233, $agpr234, 
$agpr235, $agpr236, $agpr237, $agpr238, $agpr239, $agpr240, $agpr241, $agpr242, $agpr243, $agpr244, $agpr245, $agpr246, $agpr247, $agpr248, $agpr249, $agpr250, $agpr251, $agpr252, $agpr253, $agpr254, $agpr255, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX90A-FLATSCR-NEXT: {{ $}}
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -11230,229 +13694,453 @@ body: |
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94
; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95
; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr33, $vgpr1, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr34, $vgpr2, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr35, $vgpr3, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr36, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr36, $vgpr4, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr37, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr37, $vgpr5, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr38, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr38, $vgpr6, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr39, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr39, $vgpr7, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr40, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr40, $vgpr8, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr41, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr41, $vgpr9, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr42, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr42, $vgpr10, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr43, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr43, $vgpr11, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr44, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr44, $vgpr12, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr45, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr45, $vgpr13, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr46, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr46, $vgpr14, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr47, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr47, $vgpr15, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr48, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr48, $vgpr16, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr49, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr49, $vgpr17, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr50, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr50, $vgpr18, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr51, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr51, $vgpr19, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr52, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr52, $vgpr20, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr53, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr53, $vgpr21, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr54, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr54, $vgpr22, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr55, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr55, $vgpr23, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr56, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr56, $vgpr24, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr57, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr57, $vgpr25, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr58, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr58, $vgpr26, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr59, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr59, $vgpr27, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr60, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr60, $vgpr28, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr61, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr61, $vgpr29, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr62, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr62, $vgpr30, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr63, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr63, $vgpr31, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr32 = V_ACCVGPR_READ_B32_e64 killed $agpr64, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr64, $vgpr32, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr33 = V_ACCVGPR_READ_B32_e64 killed $agpr65, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr65, $vgpr33, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr34 = V_ACCVGPR_READ_B32_e64 killed $agpr66, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr66, $vgpr34, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr35 = V_ACCVGPR_READ_B32_e64 killed $agpr67, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr67, $vgpr35, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr36 = V_ACCVGPR_READ_B32_e64 killed $agpr68, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr68, $vgpr36, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr37 = V_ACCVGPR_READ_B32_e64 killed $agpr69, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr69, $vgpr37, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr38 = V_ACCVGPR_READ_B32_e64 killed $agpr70, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr70, $vgpr38, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr39 = V_ACCVGPR_READ_B32_e64 killed $agpr71, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr71, $vgpr39, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr48 = V_ACCVGPR_READ_B32_e64 killed $agpr72, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr72, $vgpr48, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr49 = V_ACCVGPR_READ_B32_e64 killed $agpr73, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr73, $vgpr49, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr50 = V_ACCVGPR_READ_B32_e64 killed $agpr74, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr74, $vgpr50, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr51 = V_ACCVGPR_READ_B32_e64 killed $agpr75, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr75, $vgpr51, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr52 = V_ACCVGPR_READ_B32_e64 killed $agpr76, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr76, $vgpr52, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr77, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr77, $vgpr53, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr78, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr78, $vgpr54, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr79, implicit $exec
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr79, $vgpr55, 32, $exec, 64
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr80, $sgpr32, 700, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.50, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr80, 32, $exec, 64, 44800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr81, $sgpr32, 696, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.51, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr81, 32, $exec, 64, 44544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr82, $sgpr32, 692, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.52, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr82, 32, $exec, 64, 44288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr83, $sgpr32, 688, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.53, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr83, 32, $exec, 64, 44032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr84, $sgpr32, 684, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.54, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr84, 32, $exec, 64, 43776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr85, $sgpr32, 680, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.55, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr85, 32, $exec, 64, 43520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr86, $sgpr32, 676, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.56, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr86, 32, $exec, 64, 43264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr87, $sgpr32, 672, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.57, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr87, 32, $exec, 64, 43008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr88, $sgpr32, 668, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.58, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr88, 32, $exec, 64, 42752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr89, $sgpr32, 664, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.59, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr89, 32, $exec, 64, 42496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr90, $sgpr32, 660, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.60, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr90, 32, $exec, 64, 42240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr91, $sgpr32, 656, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.61, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr91, 32, $exec, 64, 41984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr92, $sgpr32, 652, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.62, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr92, 32, $exec, 64, 41728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr93, $sgpr32, 648, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.63, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr93, 32, $exec, 64, 41472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr94, $sgpr32, 644, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.64, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr94, 32, $exec, 64, 41216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr95, $sgpr32, 640, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.65, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr95, 32, $exec, 64, 40960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr96, $sgpr32, 636, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.66, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr96, 32, $exec, 64, 40704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr97, $sgpr32, 632, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.67, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr97, 32, $exec, 64, 40448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr98, $sgpr32, 628, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr98, 32, $exec, 64, 40192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr99, $sgpr32, 624, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr99, 32, $exec, 64, 39936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr100, $sgpr32, 620, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr100, 32, $exec, 64, 39680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr101, $sgpr32, 616, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr101, 32, $exec, 64, 39424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr102, $sgpr32, 612, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr102, 32, $exec, 64, 39168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr103, $sgpr32, 608, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr103, 32, $exec, 64, 38912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr104, $sgpr32, 604, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.74, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr104, 32, $exec, 64, 38656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr105, $sgpr32, 600, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.75, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr105, 32, $exec, 64, 38400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr106, $sgpr32, 596, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.76, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr106, 32, $exec, 64, 38144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr107, $sgpr32, 592, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.77, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr107, 32, $exec, 64, 37888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr108, $sgpr32, 588, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.78, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr108, 32, $exec, 64, 37632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr109, $sgpr32, 584, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.79, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr109, 32, $exec, 64, 37376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr110, $sgpr32, 580, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.80, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr110, 32, $exec, 64, 37120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr111, $sgpr32, 576, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.81, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr111, 32, $exec, 64, 36864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr112, $sgpr32, 572, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.82, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr112, 32, $exec, 64, 36608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr113, $sgpr32, 568, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.83, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr113, 32, $exec, 64, 36352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr114, $sgpr32, 564, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.84, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr114, 32, $exec, 64, 36096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr115, $sgpr32, 560, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.85, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr115, 32, $exec, 64, 35840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr116, $sgpr32, 556, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.86, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr116, 32, $exec, 64, 35584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr117, $sgpr32, 552, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.87, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr117, 32, $exec, 64, 35328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr118, $sgpr32, 548, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.88, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr118, 32, $exec, 64, 35072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr119, $sgpr32, 544, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.89, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr119, 32, $exec, 64, 34816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr120, $sgpr32, 540, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.90, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr120, 32, $exec, 64, 34560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr121, $sgpr32, 536, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.91, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr121, 32, $exec, 64, 34304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr122, $sgpr32, 532, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.92, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr122, 32, $exec, 64, 34048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr123, $sgpr32, 528, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.93, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr123, 32, $exec, 64, 33792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr124, $sgpr32, 524, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.94, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr124, 32, $exec, 64, 33536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr125, $sgpr32, 520, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.95, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr125, 32, $exec, 64, 33280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr126, $sgpr32, 516, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.96, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr126, 32, $exec, 64, 33024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr127, $sgpr32, 512, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.97, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr127, 32, $exec, 64, 32768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr128, $sgpr32, 508, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.98, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr128, 32, $exec, 64, 32512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr129, $sgpr32, 504, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.99, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr129, 32, $exec, 64, 32256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr130, $sgpr32, 500, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.100, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr130, 32, $exec, 64, 32000
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr131, $sgpr32, 496, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.101, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr131, 32, $exec, 64, 31744
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr132, $sgpr32, 492, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.102, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr132, 32, $exec, 64, 31488
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr133, $sgpr32, 488, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.103, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr133, 32, $exec, 64, 31232
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr134, $sgpr32, 484, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.104, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr134, 32, $exec, 64, 30976
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr135, $sgpr32, 480, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.105, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr135, 32, $exec, 64, 30720
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr136, $sgpr32, 476, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.106, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr136, 32, $exec, 64, 30464
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr137, $sgpr32, 472, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.107, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr137, 32, $exec, 64, 30208
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr138, $sgpr32, 468, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.108, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr138, 32, $exec, 64, 29952
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr139, $sgpr32, 464, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.109, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr139, 32, $exec, 64, 29696
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr140, $sgpr32, 460, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.110, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr140, 32, $exec, 64, 29440
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr141, $sgpr32, 456, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.111, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr141, 32, $exec, 64, 29184
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr142, $sgpr32, 452, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.112, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr142, 32, $exec, 64, 28928
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr143, $sgpr32, 448, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.113, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr143, 32, $exec, 64, 28672
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr144, $sgpr32, 444, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.114, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr144, 32, $exec, 64, 28416
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr145, $sgpr32, 440, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.115, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr145, 32, $exec, 64, 28160
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr146, $sgpr32, 436, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.116, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr146, 32, $exec, 64, 27904
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr147, $sgpr32, 432, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.117, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr147, 32, $exec, 64, 27648
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr148, $sgpr32, 428, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.118, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr148, 32, $exec, 64, 27392
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr149, $sgpr32, 424, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.119, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr149, 32, $exec, 64, 27136
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr150, $sgpr32, 420, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.120, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr150, 32, $exec, 64, 26880
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr151, $sgpr32, 416, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.121, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr151, 32, $exec, 64, 26624
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr152, $sgpr32, 412, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.122, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr152, 32, $exec, 64, 26368
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr153, $sgpr32, 408, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.123, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr153, 32, $exec, 64, 26112
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr154, $sgpr32, 404, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.124, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr154, 32, $exec, 64, 25856
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr155, $sgpr32, 400, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.125, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr155, 32, $exec, 64, 25600
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr156, $sgpr32, 396, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.126, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr156, 32, $exec, 64, 25344
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr157, $sgpr32, 392, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.127, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr157, 32, $exec, 64, 25088
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr158, $sgpr32, 388, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.128, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr158, 32, $exec, 64, 24832
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr159, $sgpr32, 384, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.129, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr159, 32, $exec, 64, 24576
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr160, $sgpr32, 380, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.130, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr160, 32, $exec, 64, 24320
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr161, $sgpr32, 376, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.131, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr161, 32, $exec, 64, 24064
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr162, $sgpr32, 372, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.132, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr162, 32, $exec, 64, 23808
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr163, $sgpr32, 368, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.133, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr163, 32, $exec, 64, 23552
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr164, $sgpr32, 364, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.134, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr164, 32, $exec, 64, 23296
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr165, $sgpr32, 360, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.135, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr165, 32, $exec, 64, 23040
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr166, $sgpr32, 356, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.136, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr166, 32, $exec, 64, 22784
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr167, $sgpr32, 352, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.137, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr167, 32, $exec, 64, 22528
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr168, $sgpr32, 348, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.138, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr168, 32, $exec, 64, 22272
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr169, $sgpr32, 344, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.139, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr169, 32, $exec, 64, 22016
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr170, $sgpr32, 340, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.140, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr170, 32, $exec, 64, 21760
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr171, $sgpr32, 336, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.141, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr171, 32, $exec, 64, 21504
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr172, $sgpr32, 332, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.142, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr172, 32, $exec, 64, 21248
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr173, $sgpr32, 328, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.143, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr173, 32, $exec, 64, 20992
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr174, $sgpr32, 324, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.144, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr174, 32, $exec, 64, 20736
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr175, $sgpr32, 320, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.145, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr175, 32, $exec, 64, 20480
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr176, $sgpr32, 316, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.146, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr176, 32, $exec, 64, 20224
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr177, $sgpr32, 312, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.147, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr177, 32, $exec, 64, 19968
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr178, $sgpr32, 308, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.148, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr178, 32, $exec, 64, 19712
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr179, $sgpr32, 304, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.149, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr179, 32, $exec, 64, 19456
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr180, $sgpr32, 300, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.150, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr180, 32, $exec, 64, 19200
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr181, $sgpr32, 296, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.151, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr181, 32, $exec, 64, 18944
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr182, $sgpr32, 292, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.152, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr182, 32, $exec, 64, 18688
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr183, $sgpr32, 288, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.153, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr183, 32, $exec, 64, 18432
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr184, $sgpr32, 284, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.154, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr184, 32, $exec, 64, 18176
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr185, $sgpr32, 280, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.155, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr185, 32, $exec, 64, 17920
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr186, $sgpr32, 276, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.156, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr186, 32, $exec, 64, 17664
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr187, $sgpr32, 272, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.157, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr187, 32, $exec, 64, 17408
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr188, $sgpr32, 268, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.158, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr188, 32, $exec, 64, 17152
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr189, $sgpr32, 264, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.159, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr189, 32, $exec, 64, 16896
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr190, $sgpr32, 260, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.160, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr190, 32, $exec, 64, 16640
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr191, $sgpr32, 256, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.161, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr191, 32, $exec, 64, 16384
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr192, $sgpr32, 252, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.162, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr192, 32, $exec, 64, 16128
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr193, $sgpr32, 248, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.163, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr193, 32, $exec, 64, 15872
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr194, $sgpr32, 244, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.164, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr194, 32, $exec, 64, 15616
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr195, $sgpr32, 240, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.165, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr195, 32, $exec, 64, 15360
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr196, $sgpr32, 236, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.166, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr196, 32, $exec, 64, 15104
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr197, $sgpr32, 232, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.167, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr197, 32, $exec, 64, 14848
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr198, $sgpr32, 228, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.168, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr198, 32, $exec, 64, 14592
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr199, $sgpr32, 224, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.169, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr199, 32, $exec, 64, 14336
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr200, $sgpr32, 220, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.170, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr200, 32, $exec, 64, 14080
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr201, $sgpr32, 216, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.171, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr201, 32, $exec, 64, 13824
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr202, $sgpr32, 212, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.172, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr202, 32, $exec, 64, 13568
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr203, $sgpr32, 208, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.173, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr203, 32, $exec, 64, 13312
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr204, $sgpr32, 204, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.174, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr204, 32, $exec, 64, 13056
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr205, $sgpr32, 200, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.175, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr205, 32, $exec, 64, 12800
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr206, $sgpr32, 196, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.176, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr206, 32, $exec, 64, 12544
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr207, $sgpr32, 192, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.177, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr207, 32, $exec, 64, 12288
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr208, $sgpr32, 188, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.178, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr208, 32, $exec, 64, 12032
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr209, $sgpr32, 184, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.179, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr209, 32, $exec, 64, 11776
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr210, $sgpr32, 180, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.180, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr210, 32, $exec, 64, 11520
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr211, $sgpr32, 176, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.181, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr211, 32, $exec, 64, 11264
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr212, $sgpr32, 172, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.182, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr212, 32, $exec, 64, 11008
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr213, $sgpr32, 168, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.183, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr213, 32, $exec, 64, 10752
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr214, $sgpr32, 164, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.184, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr214, 32, $exec, 64, 10496
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr215, $sgpr32, 160, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.185, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr215, 32, $exec, 64, 10240
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr216, $sgpr32, 156, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.186, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr216, 32, $exec, 64, 9984
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr217, $sgpr32, 152, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.187, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr217, 32, $exec, 64, 9728
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr218, $sgpr32, 148, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.188, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr218, 32, $exec, 64, 9472
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr219, $sgpr32, 144, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.189, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr219, 32, $exec, 64, 9216
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr220, $sgpr32, 140, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.190, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr220, 32, $exec, 64, 8960
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr221, $sgpr32, 136, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.191, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr221, 32, $exec, 64, 8704
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr222, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.192, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr222, 32, $exec, 64, 8448
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr223, $sgpr32, 128, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.193, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr223, 32, $exec, 64, 8192
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr224, $sgpr32, 124, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.194, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr224, 32, $exec, 64, 7936
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr225, $sgpr32, 120, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.195, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr225, 32, $exec, 64, 7680
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr226, $sgpr32, 116, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.196, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr226, 32, $exec, 64, 7424
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr227, $sgpr32, 112, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.197, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr227, 32, $exec, 64, 7168
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr228, $sgpr32, 108, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.198, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr228, 32, $exec, 64, 6912
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr229, $sgpr32, 104, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.199, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr229, 32, $exec, 64, 6656
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr230, $sgpr32, 100, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.200, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr230, 32, $exec, 64, 6400
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr231, $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.201, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr231, 32, $exec, 64, 6144
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr232, $sgpr32, 92, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.202, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr232, 32, $exec, 64, 5888
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr233, $sgpr32, 88, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.203, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr233, 32, $exec, 64, 5632
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr234, $sgpr32, 84, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.204, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr234, 32, $exec, 64, 5376
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr235, $sgpr32, 80, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.205, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr235, 32, $exec, 64, 5120
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr236, $sgpr32, 76, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.206, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr236, 32, $exec, 64, 4864
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr237, $sgpr32, 72, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.207, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr237, 32, $exec, 64, 4608
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr238, $sgpr32, 68, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.208, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr238, 32, $exec, 64, 4352
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr239, $sgpr32, 64, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.209, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr239, 32, $exec, 64, 4096
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr240, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.210, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr240, 32, $exec, 64, 3840
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr241, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.211, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr241, 32, $exec, 64, 3584
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr242, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.212, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr242, 32, $exec, 64, 3328
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr243, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.213, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr243, 32, $exec, 64, 3072
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr244, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.214, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr244, 32, $exec, 64, 2816
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr245, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.215, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr245, 32, $exec, 64, 2560
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr246, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.216, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr246, 32, $exec, 64, 2304
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr247, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.217, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr247, 32, $exec, 64, 2048
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr248, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.218, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr248, 32, $exec, 64, 1792
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr249, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.219, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr249, 32, $exec, 64, 1536
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr250, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.220, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr250, 32, $exec, 64, 1280
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr251, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.221, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr251, 32, $exec, 64, 1024
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr252, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.222, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr252, 32, $exec, 64, 768
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr253, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.223, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr253, 32, $exec, 64, 512
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr254, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.224, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr254, 32, $exec, 64, 256
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr255, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.225, addrspace 5)
+ ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $agpr255, 32, $exec, 64, 0
; GFX90A-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GFX90A-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 704, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.226, addrspace 5)
; GFX90A-FLATSCR-NEXT: $vgpr40 = V_MOV_B32_e32 $sgpr32, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index daf7fed3731f5..727728145dfe0 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -451,6 +451,7 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
; GFX90A-LABEL: v32_asm_def_use:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_read_b32 v35, a32 ; Reload Reuse
; GFX90A-NEXT: v_mov_b32_e32 v34, v0
; GFX90A-NEXT: v_mov_b32_e32 v33, v1
; GFX90A-NEXT: ;;#ASMSTART
@@ -478,8 +479,8 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; copy
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v35, a32 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_mov_b32 a32, a1
+; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; copy
@@ -1054,6 +1055,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse
; GFX90A-NEXT: v_mov_b32_e32 v33, v0
; GFX90A-NEXT: v_mov_b32_e32 v32, v1
; GFX90A-NEXT: ;;#ASMSTART
@@ -1075,8 +1077,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse
-; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
@@ -1139,9 +1140,9 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1
declare i32 @llvm.amdgcn.workitem.id.x() #2
-attributes #0 = { "amdgpu-waves-per-eu"="6,6" }
+attributes #0 = { "amdgpu-waves-per-eu"="6,6" nounwind }
attributes #1 = { convergent nounwind readnone willreturn }
attributes #2 = { nounwind readnone willreturn }
-attributes #3 = { "amdgpu-waves-per-eu"="7,7" "amdgpu-agpr-alloc"="0" }
-attributes #4 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="1024,1024" }
-attributes #5 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-agpr-alloc"="0" }
+attributes #3 = { "amdgpu-waves-per-eu"="7,7" "amdgpu-agpr-alloc"="0" nounwind }
+attributes #4 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="1024,1024" nounwind }
+attributes #5 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-agpr-alloc"="0" nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
index 60ce2ce2d99ae..86740423e09ba 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
@@ -19,16 +19,17 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
; DAGISEL-NEXT: v_writelane_b32 v42, s0, 2
+; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; DAGISEL-NEXT: ; meta instruction
; DAGISEL-NEXT: scratch_store_b32 off, v41, s33
+; DAGISEL-NEXT: v_writelane_b32 v42, s30, 0
+; DAGISEL-NEXT: v_writelane_b32 v42, s31, 1
; DAGISEL-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
; DAGISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
-; DAGISEL-NEXT: v_writelane_b32 v42, s30, 0
; DAGISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
; DAGISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
-; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
-; DAGISEL-NEXT: v_writelane_b32 v42, s31, 1
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-NEXT: global_store_b32 v[40:41], v0, off
@@ -62,16 +63,17 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: s_mov_b32 exec_lo, s1
; GISEL-NEXT: v_writelane_b32 v42, s0, 2
+; GISEL-NEXT: s_add_co_i32 s32, s32, 16
; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GISEL-NEXT: ; meta instruction
; GISEL-NEXT: scratch_store_b32 off, v41, s33
+; GISEL-NEXT: v_writelane_b32 v42, s30, 0
+; GISEL-NEXT: v_writelane_b32 v42, s31, 1
; GISEL-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_mov_b32 v41, v2
; GISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
-; GISEL-NEXT: v_writelane_b32 v42, s30, 0
; GISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
; GISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
-; GISEL-NEXT: s_add_co_i32 s32, s32, 16
-; GISEL-NEXT: v_writelane_b32 v42, s31, 1
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-NEXT: global_store_b32 v[40:41], v0, off
@@ -779,14 +781,13 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
; DAGISEL-NEXT: v_writelane_b32 v40, s0, 2
-; DAGISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
-; DAGISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
; DAGISEL-NEXT: v_writelane_b32 v40, s30, 0
; DAGISEL-NEXT: v_writelane_b32 v40, s31, 1
+; DAGISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
; DAGISEL-NEXT: s_mov_b32 s32, s33
@@ -814,14 +815,13 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: s_mov_b32 exec_lo, s1
; GISEL-NEXT: v_writelane_b32 v40, s0, 2
-; GISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
-; GISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
; GISEL-NEXT: s_add_co_i32 s32, s32, 16
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
+; GISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: s_mov_b32 s32, s33
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 5a83334419285..e00ab0f11dac4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -837,8 +837,8 @@ define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s64, 12
; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: v_writelane_b32 v32, s66, 14
-; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v32, s67, 15
+; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -988,8 +988,8 @@ define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s64, 12
; VI-NEXT: v_writelane_b32 v32, s65, 13
; VI-NEXT: v_writelane_b32 v32, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -1139,8 +1139,8 @@ define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -1276,56 +1276,56 @@ define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB3_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -3417,8 +3417,8 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s64, 12
; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: v_writelane_b32 v32, s66, 14
-; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v32, s67, 15
+; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -3552,8 +3552,8 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s64, 12
; VI-NEXT: v_writelane_b32 v32, s65, 13
; VI-NEXT: v_writelane_b32 v32, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -3687,8 +3687,8 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -3808,56 +3808,56 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -3942,6 +3942,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i32_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -4074,22 +4090,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
@@ -4121,13 +4121,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB12_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -4359,6 +4360,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; SI-NEXT: s_cbranch_execz .LBB12_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -5152,6 +5154,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v32i32_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -5260,22 +5278,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr56
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -6068,6 +6070,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v32i32_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -6175,23 +6193,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr35
; GFX9-NEXT: ; kill: killed $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr39
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; kill: killed $vgpr35
; GFX9-NEXT: ; implicit-def: $vgpr35
@@ -6228,6 +6229,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
@@ -6271,7 +6273,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(39)
+; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -6282,7 +6284,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -6468,7 +6470,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; GFX9-NEXT: s_cbranch_execz .LBB12_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_u32_e32 v32, 3, v32
-; GFX9-NEXT: s_waitcnt vmcnt(38)
+; GFX9-NEXT: s_waitcnt vmcnt(22)
; GFX9-NEXT: v_add_u32_e32 v31, 3, v31
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX9-NEXT: v_add_u32_e32 v30, 3, v30
@@ -7366,31 +7368,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) #0 {
; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -7919,8 +7940,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_writelane_b32 v20, s97, 31
; SI-NEXT: v_writelane_b32 v20, s98, 32
; SI-NEXT: v_writelane_b32 v20, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_writelane_b32 v20, s30, 34
+; SI-NEXT: v_writelane_b32 v20, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_readfirstlane_b32 s5, v18
; SI-NEXT: v_readfirstlane_b32 s4, v17
; SI-NEXT: v_readfirstlane_b32 s7, v16
@@ -7940,7 +7962,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s45, v2
; SI-NEXT: s_cmp_lg_u32 s44, 0
; SI-NEXT: v_readfirstlane_b32 s44, v1
-; SI-NEXT: v_writelane_b32 v20, s31, 35
; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB13_4
@@ -9079,8 +9100,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: v_writelane_b32 v32, s86, 28
; VI-NEXT: v_writelane_b32 v32, s87, 29
-; VI-NEXT: v_readfirstlane_b32 s44, v19
; VI-NEXT: v_writelane_b32 v32, s30, 30
+; VI-NEXT: v_writelane_b32 v32, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s44, v19
; VI-NEXT: v_readfirstlane_b32 s5, v18
; VI-NEXT: v_readfirstlane_b32 s4, v17
; VI-NEXT: v_readfirstlane_b32 s7, v16
@@ -9100,7 +9122,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s45, v2
; VI-NEXT: s_cmp_lg_u32 s44, 0
; VI-NEXT: v_readfirstlane_b32 s44, v1
-; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
; VI-NEXT: s_cbranch_scc0 .LBB13_4
; VI-NEXT: ; %bb.1: ; %cmp.false
@@ -9997,8 +10018,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v29, s97, 31
; GFX9-NEXT: v_writelane_b32 v29, s98, 32
; GFX9-NEXT: v_writelane_b32 v29, s99, 33
-; GFX9-NEXT: v_readfirstlane_b32 s44, v19
; GFX9-NEXT: v_writelane_b32 v29, s30, 34
+; GFX9-NEXT: v_writelane_b32 v29, s31, 35
+; GFX9-NEXT: v_readfirstlane_b32 s44, v19
; GFX9-NEXT: v_readfirstlane_b32 s5, v18
; GFX9-NEXT: v_readfirstlane_b32 s4, v17
; GFX9-NEXT: v_readfirstlane_b32 s7, v16
@@ -10018,7 +10040,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s45, v2
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: v_readfirstlane_b32 s44, v1
-; GFX9-NEXT: v_writelane_b32 v29, s31, 35
; GFX9-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
; GFX9-NEXT: s_cbranch_scc0 .LBB13_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
@@ -10826,42 +10847,14 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v27, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v24, s34, 0
-; GFX11-NEXT: v_writelane_b32 v25, s98, 0
-; GFX11-NEXT: v_readfirstlane_b32 s42, v15
-; GFX11-NEXT: v_readfirstlane_b32 s5, v14
-; GFX11-NEXT: v_readfirstlane_b32 s4, v13
; GFX11-NEXT: v_writelane_b32 v24, s35, 1
-; GFX11-NEXT: v_writelane_b32 v25, s99, 1
-; GFX11-NEXT: v_readfirstlane_b32 s7, v12
-; GFX11-NEXT: v_readfirstlane_b32 s6, v11
-; GFX11-NEXT: v_readfirstlane_b32 s9, v10
; GFX11-NEXT: v_writelane_b32 v24, s36, 2
-; GFX11-NEXT: v_writelane_b32 v25, s100, 2
-; GFX11-NEXT: v_readfirstlane_b32 s8, v9
-; GFX11-NEXT: v_readfirstlane_b32 s11, v8
-; GFX11-NEXT: v_readfirstlane_b32 s10, v7
; GFX11-NEXT: v_writelane_b32 v24, s37, 3
-; GFX11-NEXT: v_writelane_b32 v25, s101, 3
-; GFX11-NEXT: v_readfirstlane_b32 s13, v6
-; GFX11-NEXT: v_readfirstlane_b32 s12, v5
-; GFX11-NEXT: v_readfirstlane_b32 s15, v4
; GFX11-NEXT: v_writelane_b32 v24, s38, 4
-; GFX11-NEXT: v_writelane_b32 v25, s102, 4
-; GFX11-NEXT: v_readfirstlane_b32 s14, v3
-; GFX11-NEXT: v_readfirstlane_b32 s41, v2
-; GFX11-NEXT: v_readfirstlane_b32 s40, v1
; GFX11-NEXT: v_writelane_b32 v24, s39, 5
-; GFX11-NEXT: v_writelane_b32 v25, s103, 5
-; GFX11-NEXT: s_cmp_lg_u32 s42, 0
-; GFX11-NEXT: s_mov_b32 vcc_lo, 0
-; GFX11-NEXT: ; implicit-def: $vgpr27 : SGPR spill to VGPR lane
-; GFX11-NEXT: ; implicit-def: $vgpr26 : SGPR spill to VGPR lane
; GFX11-NEXT: v_writelane_b32 v24, s48, 6
-; GFX11-NEXT: v_writelane_b32 v25, s104, 6
; GFX11-NEXT: v_writelane_b32 v24, s49, 7
-; GFX11-NEXT: v_writelane_b32 v25, s30, 7
; GFX11-NEXT: v_writelane_b32 v24, s50, 8
-; GFX11-NEXT: v_writelane_b32 v25, s31, 8
; GFX11-NEXT: v_writelane_b32 v24, s51, 9
; GFX11-NEXT: v_writelane_b32 v24, s52, 10
; GFX11-NEXT: v_writelane_b32 v24, s53, 11
@@ -10885,6 +10878,34 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: v_writelane_b32 v24, s87, 29
; GFX11-NEXT: v_writelane_b32 v24, s96, 30
; GFX11-NEXT: v_writelane_b32 v24, s97, 31
+; GFX11-NEXT: v_writelane_b32 v25, s98, 0
+; GFX11-NEXT: v_writelane_b32 v25, s99, 1
+; GFX11-NEXT: v_writelane_b32 v25, s100, 2
+; GFX11-NEXT: v_writelane_b32 v25, s101, 3
+; GFX11-NEXT: v_writelane_b32 v25, s102, 4
+; GFX11-NEXT: v_writelane_b32 v25, s103, 5
+; GFX11-NEXT: v_writelane_b32 v25, s104, 6
+; GFX11-NEXT: v_writelane_b32 v25, s30, 7
+; GFX11-NEXT: v_writelane_b32 v25, s31, 8
+; GFX11-NEXT: v_readfirstlane_b32 s42, v15
+; GFX11-NEXT: v_readfirstlane_b32 s5, v14
+; GFX11-NEXT: v_readfirstlane_b32 s4, v13
+; GFX11-NEXT: v_readfirstlane_b32 s7, v12
+; GFX11-NEXT: v_readfirstlane_b32 s6, v11
+; GFX11-NEXT: v_readfirstlane_b32 s9, v10
+; GFX11-NEXT: v_readfirstlane_b32 s8, v9
+; GFX11-NEXT: v_readfirstlane_b32 s11, v8
+; GFX11-NEXT: v_readfirstlane_b32 s10, v7
+; GFX11-NEXT: v_readfirstlane_b32 s13, v6
+; GFX11-NEXT: v_readfirstlane_b32 s12, v5
+; GFX11-NEXT: v_readfirstlane_b32 s15, v4
+; GFX11-NEXT: v_readfirstlane_b32 s14, v3
+; GFX11-NEXT: v_readfirstlane_b32 s41, v2
+; GFX11-NEXT: v_readfirstlane_b32 s40, v1
+; GFX11-NEXT: s_cmp_lg_u32 s42, 0
+; GFX11-NEXT: s_mov_b32 vcc_lo, 0
+; GFX11-NEXT: ; implicit-def: $vgpr27 : SGPR spill to VGPR lane
+; GFX11-NEXT: ; implicit-def: $vgpr26 : SGPR spill to VGPR lane
; GFX11-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s5, 24
@@ -15695,53 +15716,99 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:392
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:384
@@ -16494,53 +16561,99 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) #0 {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
@@ -17389,8 +17502,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; SI-LABEL: bitcast_v128i8_to_v32i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s76, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -17407,6 +17518,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_readfirstlane_b32 s76, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324
; SI-NEXT: v_readfirstlane_b32 s46, v30
; SI-NEXT: v_readfirstlane_b32 s6, v29
; SI-NEXT: v_readfirstlane_b32 s7, v28
@@ -18532,15 +18645,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-LABEL: bitcast_v128i8_to_v32i32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s45, v15
-; VI-NEXT: v_readfirstlane_b32 s46, v14
-; VI-NEXT: v_readfirstlane_b32 s47, v13
-; VI-NEXT: v_readfirstlane_b32 s76, v0
-; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -18557,6 +18661,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_readfirstlane_b32 s45, v15
+; VI-NEXT: v_readfirstlane_b32 s46, v14
+; VI-NEXT: v_readfirstlane_b32 s47, v13
+; VI-NEXT: v_readfirstlane_b32 s76, v0
+; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; VI-NEXT: v_readfirstlane_b32 s6, v30
; VI-NEXT: v_readfirstlane_b32 s7, v29
; VI-NEXT: v_readfirstlane_b32 s8, v28
@@ -18584,7 +18697,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s73, v3
; VI-NEXT: v_readfirstlane_b32 s74, v2
; VI-NEXT: v_readfirstlane_b32 s75, v1
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -19417,16 +19530,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-LABEL: bitcast_v128i8_to_v32i32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s45, v15
-; GFX9-NEXT: v_readfirstlane_b32 s46, v14
-; GFX9-NEXT: v_readfirstlane_b32 s47, v13
-; GFX9-NEXT: v_readfirstlane_b32 s76, v0
-; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -19443,6 +19546,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_readfirstlane_b32 s45, v15
+; GFX9-NEXT: v_readfirstlane_b32 s46, v14
+; GFX9-NEXT: v_readfirstlane_b32 s47, v13
+; GFX9-NEXT: v_readfirstlane_b32 s76, v0
+; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; GFX9-NEXT: v_readfirstlane_b32 s6, v30
; GFX9-NEXT: v_readfirstlane_b32 s7, v29
; GFX9-NEXT: v_readfirstlane_b32 s8, v28
@@ -19470,7 +19582,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s73, v3
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s75, v1
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -20313,6 +20425,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
+; GFX11-TRUE16-NEXT: ; meta instruction
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
@@ -20428,9 +20544,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s73, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s74, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s75, 0
-; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-TRUE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -20886,6 +20999,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
+; GFX11-FAKE16-NEXT: ; meta instruction
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-FAKE16-NEXT: s_clause 0x1f
; GFX11-FAKE16-NEXT: scratch_load_u16 v32, off, s32 offset:312
; GFX11-FAKE16-NEXT: scratch_load_u16 v34, off, s32 offset:308
@@ -21001,9 +21118,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s73, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s74, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s75, 0
-; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-FAKE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -22314,8 +22428,9 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: v_writelane_b32 v33, s97, 31
; SI-NEXT: v_writelane_b32 v33, s98, 32
; SI-NEXT: v_writelane_b32 v33, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v33, s30, 34
+; SI-NEXT: v_writelane_b32 v33, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_readfirstlane_b32 s70, v17
; SI-NEXT: v_readfirstlane_b32 s71, v16
; SI-NEXT: v_readfirstlane_b32 s80, v15
@@ -22335,7 +22450,6 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: v_readfirstlane_b32 s8, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s9, v0
-; SI-NEXT: v_writelane_b32 v33, s31, 35
; SI-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB17_2
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -23080,8 +23194,6 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64bf16_to_v32i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -23098,6 +23210,8 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v0
; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v1
@@ -23182,8 +23296,10 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: v_mul_f32_e32 v62, 1.0, v54
; SI-NEXT: v_mul_f32_e32 v60, 1.0, v53
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v37
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -23268,7 +23384,6 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18
; SI-NEXT: v_mul_f32_e32 v58, 1.0, v52
; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19
; SI-NEXT: v_mul_f32_e32 v47, 1.0, v51
@@ -24895,26 +25010,41 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) #0 {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -26075,14 +26205,28 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
; SI-NEXT: v_writelane_b32 v63, s37, 3
; SI-NEXT: v_writelane_b32 v63, s30, 4
-; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: v_writelane_b32 v63, s31, 5
+; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: s_and_b32 s12, s25, 0xffff0000
; SI-NEXT: s_and_b32 s30, vcc_lo, 0xffff0000
; SI-NEXT: s_lshl_b32 s31, vcc_lo, 16
@@ -26217,27 +26361,11 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s43
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 vcc_lo, 0
; SI-NEXT: v_mul_f32_e64 v55, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v53, 1.0, s40
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s29
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s28
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v62, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s26
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s25
@@ -26915,8 +27043,8 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_writelane_b32 v35, s64, 12
; VI-NEXT: v_writelane_b32 v35, s65, 13
; VI-NEXT: v_writelane_b32 v35, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v35, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -27626,8 +27754,8 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_writelane_b32 v36, s64, 12
; GFX9-NEXT: v_writelane_b32 v36, s65, 13
; GFX9-NEXT: v_writelane_b32 v36, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v36, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -28341,56 +28469,56 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 9
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 10
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 11
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 12
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 13
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 14
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 15
; GFX11-TRUE16-NEXT: s_mov_b32 s36, s0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s53, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s52, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s51, s27
+; GFX11-TRUE16-NEXT: s_mov_b32 s50, s26
+; GFX11-TRUE16-NEXT: s_mov_b32 s49, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s48, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23
; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21
; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20
; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19
; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-TRUE16-NEXT: s_mov_b32 s48, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17
; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16
; GFX11-TRUE16-NEXT: s_mov_b32 s39, s3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-TRUE16-NEXT: s_mov_b32 s49, s25
; GFX11-TRUE16-NEXT: s_mov_b32 s38, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s37, s1
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-TRUE16-NEXT: s_mov_b32 s50, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-TRUE16-NEXT: s_mov_b32 s51, s27
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 8
-; GFX11-TRUE16-NEXT: s_mov_b32 s52, s28
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 9
-; GFX11-TRUE16-NEXT: s_mov_b32 s53, s29
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 10
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 11
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 12
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 13
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 14
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 15
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -29067,56 +29195,56 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 9
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 11
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 12
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 13
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 14
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 15
; GFX11-FAKE16-NEXT: s_mov_b32 s36, s0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s53, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s52, s28
+; GFX11-FAKE16-NEXT: s_mov_b32 s51, s27
+; GFX11-FAKE16-NEXT: s_mov_b32 s50, s26
+; GFX11-FAKE16-NEXT: s_mov_b32 s49, s25
+; GFX11-FAKE16-NEXT: s_mov_b32 s48, s24
; GFX11-FAKE16-NEXT: s_mov_b32 s47, s23
; GFX11-FAKE16-NEXT: s_mov_b32 s46, s22
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
; GFX11-FAKE16-NEXT: s_mov_b32 s45, s21
; GFX11-FAKE16-NEXT: s_mov_b32 s44, s20
; GFX11-FAKE16-NEXT: s_mov_b32 s43, s19
; GFX11-FAKE16-NEXT: s_mov_b32 s42, s18
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-FAKE16-NEXT: s_mov_b32 s48, s24
; GFX11-FAKE16-NEXT: s_mov_b32 s41, s17
; GFX11-FAKE16-NEXT: s_mov_b32 s40, s16
; GFX11-FAKE16-NEXT: s_mov_b32 s39, s3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-FAKE16-NEXT: s_mov_b32 s49, s25
; GFX11-FAKE16-NEXT: s_mov_b32 s38, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s37, s1
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-FAKE16-NEXT: s_mov_b32 s50, s26
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-FAKE16-NEXT: s_mov_b32 s51, s27
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 8
-; GFX11-FAKE16-NEXT: s_mov_b32 s52, s28
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 9
-; GFX11-FAKE16-NEXT: s_mov_b32 s53, s29
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 10
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 11
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 12
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 13
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 14
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 15
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_3
; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -29837,8 +29965,6 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i32_to_v64f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -29855,6 +29981,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr44
@@ -29886,13 +30014,14 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB20_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16
; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16
@@ -29966,6 +30095,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) #0 {
; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
@@ -30107,6 +30237,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52
; SI-NEXT: v_or_b32_e32 v30, v30, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51
; SI-NEXT: v_or_b32_e32 v7, v7, v55
@@ -30321,8 +30452,9 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_writelane_b32 v32, s68, 18
; SI-NEXT: v_writelane_b32 v32, s69, 19
-; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_writelane_b32 v32, s30, 20
+; SI-NEXT: v_writelane_b32 v32, s31, 21
+; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_readfirstlane_b32 s5, v17
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_readfirstlane_b32 s7, v15
@@ -30342,7 +30474,6 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s45, v1
; SI-NEXT: s_cmp_lg_u32 s44, 0
; SI-NEXT: v_readfirstlane_b32 s44, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 21
; SI-NEXT: s_cbranch_scc0 .LBB21_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 16
@@ -31922,12 +32053,12 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v34, s97, 31
; SI-NEXT: v_writelane_b32 v34, s98, 32
; SI-NEXT: v_writelane_b32 v34, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s6, v17
; SI-NEXT: v_writelane_b32 v34, s30, 34
+; SI-NEXT: v_writelane_b32 v34, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s6, v17
; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16
; SI-NEXT: v_readfirstlane_b32 s8, v16
; SI-NEXT: ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v34, s31, 35
; SI-NEXT: s_lshr_b32 vcc_hi, s8, 16
; SI-NEXT: v_readfirstlane_b32 s10, v15
; SI-NEXT: v_readfirstlane_b32 s12, v14
@@ -32444,8 +32575,8 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v33, s64, 12
; VI-NEXT: v_writelane_b32 v33, s65, 13
; VI-NEXT: v_writelane_b32 v33, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v33, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -32724,8 +32855,8 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -32862,56 +32993,56 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB23_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -33012,8 +33143,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i32_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -33030,6 +33159,8 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr44
@@ -33061,13 +33192,14 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB24_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16
; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16
@@ -33141,6 +33273,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) #0 {
; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
@@ -33282,6 +33415,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52
; SI-NEXT: v_or_b32_e32 v30, v30, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51
; SI-NEXT: v_or_b32_e32 v7, v7, v55
@@ -33496,8 +33630,9 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_writelane_b32 v32, s68, 18
; SI-NEXT: v_writelane_b32 v32, s69, 19
-; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_writelane_b32 v32, s30, 20
+; SI-NEXT: v_writelane_b32 v32, s31, 21
+; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_readfirstlane_b32 s5, v17
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_readfirstlane_b32 s7, v15
@@ -33517,7 +33652,6 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s45, v1
; SI-NEXT: s_cmp_lg_u32 s44, 0
; SI-NEXT: v_readfirstlane_b32 s44, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 21
; SI-NEXT: s_cbranch_scc0 .LBB25_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 16
@@ -34927,12 +35061,12 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v32, s97, 31
; SI-NEXT: v_writelane_b32 v32, s98, 32
; SI-NEXT: v_writelane_b32 v32, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s9, v16
; SI-NEXT: v_writelane_b32 v32, s30, 34
+; SI-NEXT: v_writelane_b32 v32, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s9, v16
; SI-NEXT: s_lshr_b32 s14, s9, 16
; SI-NEXT: v_readfirstlane_b32 s13, v14
; SI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v32, s31, 35
; SI-NEXT: v_readfirstlane_b32 s7, v17
; SI-NEXT: v_readfirstlane_b32 s11, v15
; SI-NEXT: s_lshr_b32 s72, s13, 16
@@ -35583,8 +35717,8 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -35720,56 +35854,56 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB27_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -36092,8 +36226,8 @@ define inreg <16 x i64> @bitcast_v32f32_to_v16i64_scalar(<32 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s64, 12
; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: v_writelane_b32 v32, s66, 14
-; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v32, s67, 15
+; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -36243,8 +36377,8 @@ define inreg <16 x i64> @bitcast_v32f32_to_v16i64_scalar(<32 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s64, 12
; VI-NEXT: v_writelane_b32 v32, s65, 13
; VI-NEXT: v_writelane_b32 v32, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -36394,8 +36528,8 @@ define inreg <16 x i64> @bitcast_v32f32_to_v16i64_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -36531,56 +36665,56 @@ define inreg <16 x i64> @bitcast_v32f32_to_v16i64_scalar(<32 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB29_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -37514,8 +37648,8 @@ define inreg <16 x double> @bitcast_v32f32_to_v16f64_scalar(<32 x float> inreg %
; SI-NEXT: v_writelane_b32 v32, s64, 12
; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: v_writelane_b32 v32, s66, 14
-; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v32, s67, 15
+; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -37665,8 +37799,8 @@ define inreg <16 x double> @bitcast_v32f32_to_v16f64_scalar(<32 x float> inreg %
; VI-NEXT: v_writelane_b32 v32, s64, 12
; VI-NEXT: v_writelane_b32 v32, s65, 13
; VI-NEXT: v_writelane_b32 v32, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -37816,8 +37950,8 @@ define inreg <16 x double> @bitcast_v32f32_to_v16f64_scalar(<32 x float> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -37953,56 +38087,56 @@ define inreg <16 x double> @bitcast_v32f32_to_v16f64_scalar(<32 x float> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB33_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -38277,8 +38411,8 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg %
; SI-NEXT: v_writelane_b32 v32, s64, 12
; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: v_writelane_b32 v32, s66, 14
-; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v32, s67, 15
+; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -38412,8 +38546,8 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg %
; VI-NEXT: v_writelane_b32 v32, s64, 12
; VI-NEXT: v_writelane_b32 v32, s65, 13
; VI-NEXT: v_writelane_b32 v32, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -38547,8 +38681,8 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -38668,56 +38802,56 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB35_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -38802,6 +38936,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f32_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -38934,22 +39084,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
@@ -38981,13 +39115,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB36_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -39219,6 +39354,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; SI-NEXT: s_cbranch_execz .LBB36_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_f32_e32 v31, 1.0, v31
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v32, 1.0, v32
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -40012,6 +40148,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v32f32_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -40120,22 +40272,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr56
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -40928,6 +41064,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v32f32_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -41035,23 +41187,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr35
; GFX9-NEXT: ; kill: killed $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr39
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; kill: killed $vgpr35
; GFX9-NEXT: ; implicit-def: $vgpr35
@@ -41088,6 +41223,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
@@ -41131,7 +41267,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(39)
+; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -41142,7 +41278,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -41328,7 +41464,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; GFX9-NEXT: s_cbranch_execz .LBB36_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32
-; GFX9-NEXT: s_waitcnt vmcnt(38)
+; GFX9-NEXT: s_waitcnt vmcnt(22)
; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30
@@ -42209,31 +42345,50 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) #0 {
; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -42710,7 +42865,19 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
@@ -42746,8 +42913,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v63, s98, 32
; SI-NEXT: v_writelane_b32 v63, s99, 33
; SI-NEXT: v_writelane_b32 v63, s30, 34
-; SI-NEXT: v_readfirstlane_b32 s4, v19
; SI-NEXT: v_writelane_b32 v63, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s4, v19
; SI-NEXT: v_readfirstlane_b32 s45, v18
; SI-NEXT: v_readfirstlane_b32 s44, v17
; SI-NEXT: v_readfirstlane_b32 s47, v16
@@ -42767,19 +42934,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s77, v2
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s76, v1
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane
; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB37_3
@@ -44044,6 +44198,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v63, s34, 0
; VI-NEXT: v_writelane_b32 v63, s35, 1
; VI-NEXT: v_writelane_b32 v63, s36, 2
@@ -44075,8 +44243,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v63, s86, 28
; VI-NEXT: v_writelane_b32 v63, s87, 29
; VI-NEXT: v_writelane_b32 v63, s30, 30
-; VI-NEXT: v_readfirstlane_b32 s44, v19
; VI-NEXT: v_writelane_b32 v63, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s44, v19
; VI-NEXT: v_readfirstlane_b32 s5, v18
; VI-NEXT: v_readfirstlane_b32 s4, v17
; VI-NEXT: v_readfirstlane_b32 s7, v16
@@ -44096,20 +44264,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s45, v2
; VI-NEXT: s_cmp_lg_u32 s44, 0
; VI-NEXT: v_readfirstlane_b32 s44, v1
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; VI-NEXT: s_cbranch_scc0 .LBB37_3
; VI-NEXT: ; %bb.1: ; %cmp.false
@@ -45273,6 +45427,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s34, 0
; GFX9-NEXT: v_writelane_b32 v63, s35, 1
; GFX9-NEXT: v_writelane_b32 v63, s36, 2
@@ -45308,8 +45476,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v63, s98, 32
; GFX9-NEXT: v_writelane_b32 v63, s99, 33
; GFX9-NEXT: v_writelane_b32 v63, s30, 34
-; GFX9-NEXT: v_readfirstlane_b32 s44, v19
; GFX9-NEXT: v_writelane_b32 v63, s31, 35
+; GFX9-NEXT: v_readfirstlane_b32 s44, v19
; GFX9-NEXT: v_readfirstlane_b32 s5, v18
; GFX9-NEXT: v_readfirstlane_b32 s4, v17
; GFX9-NEXT: v_readfirstlane_b32 s7, v16
@@ -45329,20 +45497,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s45, v2
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: v_readfirstlane_b32 s44, v1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; GFX9-NEXT: s_cbranch_scc0 .LBB37_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
@@ -46481,62 +46635,51 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v74, s34, 0
-; GFX11-NEXT: v_writelane_b32 v75, s98, 0
-; GFX11-NEXT: v_readfirstlane_b32 s42, v15
-; GFX11-NEXT: v_readfirstlane_b32 s5, v14
-; GFX11-NEXT: v_readfirstlane_b32 s4, v13
-; GFX11-NEXT: v_writelane_b32 v74, s35, 1
-; GFX11-NEXT: v_writelane_b32 v75, s99, 1
-; GFX11-NEXT: v_readfirstlane_b32 s7, v12
-; GFX11-NEXT: v_readfirstlane_b32 s6, v11
-; GFX11-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-NEXT: v_writelane_b32 v74, s36, 2
-; GFX11-NEXT: v_writelane_b32 v75, s100, 2
-; GFX11-NEXT: v_readfirstlane_b32 s8, v9
-; GFX11-NEXT: v_readfirstlane_b32 s11, v8
-; GFX11-NEXT: v_readfirstlane_b32 s10, v7
-; GFX11-NEXT: v_writelane_b32 v74, s37, 3
-; GFX11-NEXT: v_writelane_b32 v75, s101, 3
-; GFX11-NEXT: v_readfirstlane_b32 s13, v6
-; GFX11-NEXT: v_readfirstlane_b32 s12, v5
-; GFX11-NEXT: v_readfirstlane_b32 s15, v4
-; GFX11-NEXT: v_writelane_b32 v74, s38, 4
-; GFX11-NEXT: v_writelane_b32 v75, s102, 4
-; GFX11-NEXT: v_readfirstlane_b32 s14, v3
-; GFX11-NEXT: v_readfirstlane_b32 s41, v2
-; GFX11-NEXT: v_readfirstlane_b32 s40, v1
-; GFX11-NEXT: v_writelane_b32 v74, s39, 5
-; GFX11-NEXT: v_writelane_b32 v75, s103, 5
-; GFX11-NEXT: s_cmp_lg_u32 s42, 0
-; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:4
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v73, s32
+; GFX11-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-NEXT: v_writelane_b32 v74, s39, 5
; GFX11-NEXT: v_writelane_b32 v74, s48, 6
-; GFX11-NEXT: v_writelane_b32 v75, s104, 6
-; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
-; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-NEXT: v_writelane_b32 v74, s49, 7
-; GFX11-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-NEXT: v_writelane_b32 v74, s50, 8
-; GFX11-NEXT: v_writelane_b32 v75, s31, 8
; GFX11-NEXT: v_writelane_b32 v74, s51, 9
; GFX11-NEXT: v_writelane_b32 v74, s52, 10
; GFX11-NEXT: v_writelane_b32 v74, s53, 11
@@ -46560,6 +46703,34 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11-NEXT: v_writelane_b32 v74, s87, 29
; GFX11-NEXT: v_writelane_b32 v74, s96, 30
; GFX11-NEXT: v_writelane_b32 v74, s97, 31
+; GFX11-NEXT: v_writelane_b32 v75, s98, 0
+; GFX11-NEXT: v_writelane_b32 v75, s99, 1
+; GFX11-NEXT: v_writelane_b32 v75, s100, 2
+; GFX11-NEXT: v_writelane_b32 v75, s101, 3
+; GFX11-NEXT: v_writelane_b32 v75, s102, 4
+; GFX11-NEXT: v_writelane_b32 v75, s103, 5
+; GFX11-NEXT: v_writelane_b32 v75, s104, 6
+; GFX11-NEXT: v_writelane_b32 v75, s30, 7
+; GFX11-NEXT: v_writelane_b32 v75, s31, 8
+; GFX11-NEXT: v_readfirstlane_b32 s42, v15
+; GFX11-NEXT: v_readfirstlane_b32 s5, v14
+; GFX11-NEXT: v_readfirstlane_b32 s4, v13
+; GFX11-NEXT: v_readfirstlane_b32 s7, v12
+; GFX11-NEXT: v_readfirstlane_b32 s6, v11
+; GFX11-NEXT: v_readfirstlane_b32 s9, v10
+; GFX11-NEXT: v_readfirstlane_b32 s8, v9
+; GFX11-NEXT: v_readfirstlane_b32 s11, v8
+; GFX11-NEXT: v_readfirstlane_b32 s10, v7
+; GFX11-NEXT: v_readfirstlane_b32 s13, v6
+; GFX11-NEXT: v_readfirstlane_b32 s12, v5
+; GFX11-NEXT: v_readfirstlane_b32 s15, v4
+; GFX11-NEXT: v_readfirstlane_b32 s14, v3
+; GFX11-NEXT: v_readfirstlane_b32 s41, v2
+; GFX11-NEXT: v_readfirstlane_b32 s40, v1
+; GFX11-NEXT: s_cmp_lg_u32 s42, 0
+; GFX11-NEXT: s_mov_b32 vcc_lo, 0
+; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
+; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-NEXT: s_cbranch_scc0 .LBB37_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s5, 24
@@ -51435,53 +51606,99 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:392
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:384
@@ -52234,53 +52451,99 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) #0 {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
@@ -53129,8 +53392,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; SI-LABEL: bitcast_v128i8_to_v32f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s76, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -53147,6 +53408,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_readfirstlane_b32 s76, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324
; SI-NEXT: v_readfirstlane_b32 s46, v30
; SI-NEXT: v_readfirstlane_b32 s6, v29
; SI-NEXT: v_readfirstlane_b32 s7, v28
@@ -54272,15 +54535,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-LABEL: bitcast_v128i8_to_v32f32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s45, v15
-; VI-NEXT: v_readfirstlane_b32 s46, v14
-; VI-NEXT: v_readfirstlane_b32 s47, v13
-; VI-NEXT: v_readfirstlane_b32 s76, v0
-; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -54297,6 +54551,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_readfirstlane_b32 s45, v15
+; VI-NEXT: v_readfirstlane_b32 s46, v14
+; VI-NEXT: v_readfirstlane_b32 s47, v13
+; VI-NEXT: v_readfirstlane_b32 s76, v0
+; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; VI-NEXT: v_readfirstlane_b32 s6, v30
; VI-NEXT: v_readfirstlane_b32 s7, v29
; VI-NEXT: v_readfirstlane_b32 s8, v28
@@ -54324,7 +54587,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s73, v3
; VI-NEXT: v_readfirstlane_b32 s74, v2
; VI-NEXT: v_readfirstlane_b32 s75, v1
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -55157,16 +55420,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-LABEL: bitcast_v128i8_to_v32f32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s45, v15
-; GFX9-NEXT: v_readfirstlane_b32 s46, v14
-; GFX9-NEXT: v_readfirstlane_b32 s47, v13
-; GFX9-NEXT: v_readfirstlane_b32 s76, v0
-; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -55183,6 +55436,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_readfirstlane_b32 s45, v15
+; GFX9-NEXT: v_readfirstlane_b32 s46, v14
+; GFX9-NEXT: v_readfirstlane_b32 s47, v13
+; GFX9-NEXT: v_readfirstlane_b32 s76, v0
+; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; GFX9-NEXT: v_readfirstlane_b32 s6, v30
; GFX9-NEXT: v_readfirstlane_b32 s7, v29
; GFX9-NEXT: v_readfirstlane_b32 s8, v28
@@ -55210,7 +55472,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s73, v3
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s75, v1
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -56053,6 +56315,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
+; GFX11-TRUE16-NEXT: ; meta instruction
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
@@ -56168,9 +56434,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s73, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s74, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s75, 0
-; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-TRUE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -56626,6 +56889,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
+; GFX11-FAKE16-NEXT: ; meta instruction
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-FAKE16-NEXT: s_clause 0x1f
; GFX11-FAKE16-NEXT: scratch_load_u16 v32, off, s32 offset:312
; GFX11-FAKE16-NEXT: scratch_load_u16 v34, off, s32 offset:308
@@ -56741,9 +57008,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s73, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s74, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s75, 0
-; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-FAKE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -58003,7 +58267,20 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
@@ -58039,8 +58316,8 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: v_writelane_b32 v63, s98, 32
; SI-NEXT: v_writelane_b32 v63, s99, 33
; SI-NEXT: v_writelane_b32 v63, s30, 34
-; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v63, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_readfirstlane_b32 s6, v17
; SI-NEXT: v_readfirstlane_b32 s7, v16
; SI-NEXT: v_readfirstlane_b32 s8, v15
@@ -58060,20 +58337,6 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: v_readfirstlane_b32 s46, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s47, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB41_3
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -58648,8 +58911,8 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s30, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s31, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s51, v17
; VI-NEXT: v_readfirstlane_b32 s50, v16
; VI-NEXT: v_readfirstlane_b32 s49, v15
@@ -58773,8 +59036,8 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -58890,36 +59153,36 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-NEXT: s_mov_b32 s37, s29
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: s_mov_b32 s36, s28
; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: s_mov_b32 s13, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB41_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -59012,8 +59275,6 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64bf16_to_v32f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -59030,6 +59291,8 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v0
; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v1
@@ -59114,8 +59377,10 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: v_mul_f32_e32 v62, 1.0, v54
; SI-NEXT: v_mul_f32_e32 v60, 1.0, v53
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v37
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -59200,7 +59465,6 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18
; SI-NEXT: v_mul_f32_e32 v58, 1.0, v52
; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19
; SI-NEXT: v_mul_f32_e32 v47, 1.0, v51
@@ -60827,26 +61091,41 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) #0 {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -62007,14 +62286,28 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
; SI-NEXT: v_writelane_b32 v63, s37, 3
; SI-NEXT: v_writelane_b32 v63, s30, 4
-; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: v_writelane_b32 v63, s31, 5
+; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: s_and_b32 s12, s25, 0xffff0000
; SI-NEXT: s_and_b32 s30, vcc_lo, 0xffff0000
; SI-NEXT: s_lshl_b32 s31, vcc_lo, 16
@@ -62149,27 +62442,11 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s43
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 vcc_lo, 0
; SI-NEXT: v_mul_f32_e64 v55, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v53, 1.0, s40
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s29
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s28
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v62, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s26
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s25
@@ -62847,8 +63124,8 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_writelane_b32 v35, s64, 12
; VI-NEXT: v_writelane_b32 v35, s65, 13
; VI-NEXT: v_writelane_b32 v35, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v35, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -63558,8 +63835,8 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX9-NEXT: v_writelane_b32 v36, s64, 12
; GFX9-NEXT: v_writelane_b32 v36, s65, 13
; GFX9-NEXT: v_writelane_b32 v36, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v36, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -64273,56 +64550,56 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 9
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 10
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 11
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 12
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 13
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 14
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 15
; GFX11-TRUE16-NEXT: s_mov_b32 s36, s0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s53, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s52, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s51, s27
+; GFX11-TRUE16-NEXT: s_mov_b32 s50, s26
+; GFX11-TRUE16-NEXT: s_mov_b32 s49, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s48, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23
; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21
; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20
; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19
; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-TRUE16-NEXT: s_mov_b32 s48, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17
; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16
; GFX11-TRUE16-NEXT: s_mov_b32 s39, s3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-TRUE16-NEXT: s_mov_b32 s49, s25
; GFX11-TRUE16-NEXT: s_mov_b32 s38, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s37, s1
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-TRUE16-NEXT: s_mov_b32 s50, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-TRUE16-NEXT: s_mov_b32 s51, s27
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 8
-; GFX11-TRUE16-NEXT: s_mov_b32 s52, s28
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 9
-; GFX11-TRUE16-NEXT: s_mov_b32 s53, s29
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 10
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 11
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 12
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 13
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 14
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 15
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -64999,56 +65276,56 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 9
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 11
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 12
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 13
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 14
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 15
; GFX11-FAKE16-NEXT: s_mov_b32 s36, s0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s53, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s52, s28
+; GFX11-FAKE16-NEXT: s_mov_b32 s51, s27
+; GFX11-FAKE16-NEXT: s_mov_b32 s50, s26
+; GFX11-FAKE16-NEXT: s_mov_b32 s49, s25
+; GFX11-FAKE16-NEXT: s_mov_b32 s48, s24
; GFX11-FAKE16-NEXT: s_mov_b32 s47, s23
; GFX11-FAKE16-NEXT: s_mov_b32 s46, s22
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
; GFX11-FAKE16-NEXT: s_mov_b32 s45, s21
; GFX11-FAKE16-NEXT: s_mov_b32 s44, s20
; GFX11-FAKE16-NEXT: s_mov_b32 s43, s19
; GFX11-FAKE16-NEXT: s_mov_b32 s42, s18
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-FAKE16-NEXT: s_mov_b32 s48, s24
; GFX11-FAKE16-NEXT: s_mov_b32 s41, s17
; GFX11-FAKE16-NEXT: s_mov_b32 s40, s16
; GFX11-FAKE16-NEXT: s_mov_b32 s39, s3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-FAKE16-NEXT: s_mov_b32 s49, s25
; GFX11-FAKE16-NEXT: s_mov_b32 s38, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s37, s1
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-FAKE16-NEXT: s_mov_b32 s50, s26
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-FAKE16-NEXT: s_mov_b32 s51, s27
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 8
-; GFX11-FAKE16-NEXT: s_mov_b32 s52, s28
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 9
-; GFX11-FAKE16-NEXT: s_mov_b32 s53, s29
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 10
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 11
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 12
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 13
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 14
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 15
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_3
; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -65769,8 +66046,6 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f32_to_v64f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -65787,6 +66062,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr44
@@ -65818,13 +66095,14 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB44_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16
; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16
@@ -65898,6 +66176,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) #0 {
; SI-NEXT: v_add_f32_e32 v26, 1.0, v26
; SI-NEXT: v_add_f32_e32 v29, 1.0, v29
; SI-NEXT: v_add_f32_e32 v28, 1.0, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v31, 1.0, v31
; SI-NEXT: v_add_f32_e32 v30, 1.0, v30
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
@@ -66039,6 +66318,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52
; SI-NEXT: v_or_b32_e32 v30, v30, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51
; SI-NEXT: v_or_b32_e32 v7, v7, v55
@@ -66216,7 +66496,21 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
@@ -66238,8 +66532,8 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v63, s68, 18
; SI-NEXT: v_writelane_b32 v63, s69, 19
; SI-NEXT: v_writelane_b32 v63, s30, 20
-; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_writelane_b32 v63, s31, 21
+; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_readfirstlane_b32 s5, v17
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_readfirstlane_b32 s7, v15
@@ -66259,21 +66553,6 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s45, v1
; SI-NEXT: s_cmp_lg_u32 s44, 0
; SI-NEXT: v_readfirstlane_b32 s44, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB45_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s69, s5, 16
@@ -66640,8 +66919,8 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s30, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s31, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s51, v17
; VI-NEXT: v_readfirstlane_b32 s50, v16
; VI-NEXT: v_readfirstlane_b32 s49, v15
@@ -66765,8 +67044,8 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -66882,36 +67161,36 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-NEXT: s_mov_b32 s37, s29
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: s_mov_b32 s36, s28
; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: s_mov_b32 s13, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB45_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -68002,12 +68281,12 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v34, s97, 31
; SI-NEXT: v_writelane_b32 v34, s98, 32
; SI-NEXT: v_writelane_b32 v34, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s6, v17
; SI-NEXT: v_writelane_b32 v34, s30, 34
+; SI-NEXT: v_writelane_b32 v34, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s6, v17
; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16
; SI-NEXT: v_readfirstlane_b32 s8, v16
; SI-NEXT: ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v34, s31, 35
; SI-NEXT: s_lshr_b32 vcc_hi, s8, 16
; SI-NEXT: v_readfirstlane_b32 s10, v15
; SI-NEXT: v_readfirstlane_b32 s12, v14
@@ -68524,8 +68803,8 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; VI-NEXT: v_writelane_b32 v33, s64, 12
; VI-NEXT: v_writelane_b32 v33, s65, 13
; VI-NEXT: v_writelane_b32 v33, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v33, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -68804,8 +69083,8 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -68942,56 +69221,56 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB47_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -69092,8 +69371,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f32_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -69110,6 +69387,8 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr44
@@ -69141,13 +69420,14 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB48_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16
; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16
@@ -69221,6 +69501,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) #0 {
; SI-NEXT: v_add_f32_e32 v26, 1.0, v26
; SI-NEXT: v_add_f32_e32 v29, 1.0, v29
; SI-NEXT: v_add_f32_e32 v28, 1.0, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v31, 1.0, v31
; SI-NEXT: v_add_f32_e32 v30, 1.0, v30
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
@@ -69362,6 +69643,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52
; SI-NEXT: v_or_b32_e32 v30, v30, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51
; SI-NEXT: v_or_b32_e32 v7, v7, v55
@@ -69539,7 +69821,21 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
@@ -69561,8 +69857,8 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v63, s68, 18
; SI-NEXT: v_writelane_b32 v63, s69, 19
; SI-NEXT: v_writelane_b32 v63, s30, 20
-; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_writelane_b32 v63, s31, 21
+; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_readfirstlane_b32 s5, v17
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_readfirstlane_b32 s7, v15
@@ -69582,21 +69878,6 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s45, v1
; SI-NEXT: s_cmp_lg_u32 s44, 0
; SI-NEXT: v_readfirstlane_b32 s44, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB49_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s69, s5, 16
@@ -69963,8 +70244,8 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s30, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s31, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s51, v17
; VI-NEXT: v_readfirstlane_b32 s50, v16
; VI-NEXT: v_readfirstlane_b32 s49, v15
@@ -70088,8 +70369,8 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -70205,36 +70486,36 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-NEXT: s_mov_b32 s37, s29
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: s_mov_b32 s36, s28
; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: s_mov_b32 s13, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB49_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -71155,12 +71436,12 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s97, 31
; SI-NEXT: v_writelane_b32 v32, s98, 32
; SI-NEXT: v_writelane_b32 v32, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s9, v16
; SI-NEXT: v_writelane_b32 v32, s30, 34
+; SI-NEXT: v_writelane_b32 v32, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s9, v16
; SI-NEXT: s_lshr_b32 s14, s9, 16
; SI-NEXT: v_readfirstlane_b32 s13, v14
; SI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v32, s31, 35
; SI-NEXT: v_readfirstlane_b32 s7, v17
; SI-NEXT: v_readfirstlane_b32 s11, v15
; SI-NEXT: s_lshr_b32 s72, s13, 16
@@ -71811,8 +72092,8 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -71948,56 +72229,56 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB51_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -72882,8 +73163,8 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s64, 12
; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: v_writelane_b32 v32, s66, 14
-; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v32, s67, 15
+; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -73017,8 +73298,8 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s64, 12
; VI-NEXT: v_writelane_b32 v32, s65, 13
; VI-NEXT: v_writelane_b32 v32, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -73152,8 +73433,8 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -73273,56 +73554,56 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB55_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -73407,6 +73688,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i64_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -73539,22 +73836,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
@@ -73586,13 +73867,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB56_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -73853,6 +74135,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
@@ -74617,6 +74900,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v16i64_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -74725,22 +75024,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr56
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -75533,6 +75816,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v16i64_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -75640,23 +75939,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr35
; GFX9-NEXT: ; kill: killed $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr39
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; kill: killed $vgpr35
; GFX9-NEXT: ; implicit-def: $vgpr35
@@ -75693,6 +75975,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
@@ -75736,7 +76019,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(39)
+; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -75747,7 +76030,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -75962,7 +76245,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v28, vcc
; GFX9-NEXT: v_add_co_u32_e32 v29, vcc, 3, v29
; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, 0, v30, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(38)
+; GFX9-NEXT: s_waitcnt vmcnt(22)
; GFX9-NEXT: v_add_co_u32_e32 v31, vcc, 3, v31
; GFX9-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v32, vcc
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
@@ -76839,31 +77122,50 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) #0 {
; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -77400,8 +77702,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_writelane_b32 v20, s97, 31
; SI-NEXT: v_writelane_b32 v20, s98, 32
; SI-NEXT: v_writelane_b32 v20, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_writelane_b32 v20, s30, 34
+; SI-NEXT: v_writelane_b32 v20, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_readfirstlane_b32 s5, v18
; SI-NEXT: v_readfirstlane_b32 s4, v17
; SI-NEXT: v_readfirstlane_b32 s7, v16
@@ -77421,7 +77724,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s45, v2
; SI-NEXT: s_cmp_lg_u32 s44, 0
; SI-NEXT: v_readfirstlane_b32 s44, v1
-; SI-NEXT: v_writelane_b32 v20, s31, 35
; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB57_4
@@ -78560,8 +78862,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: v_writelane_b32 v32, s86, 28
; VI-NEXT: v_writelane_b32 v32, s87, 29
-; VI-NEXT: v_readfirstlane_b32 s44, v19
; VI-NEXT: v_writelane_b32 v32, s30, 30
+; VI-NEXT: v_writelane_b32 v32, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s44, v19
; VI-NEXT: v_readfirstlane_b32 s5, v18
; VI-NEXT: v_readfirstlane_b32 s4, v17
; VI-NEXT: v_readfirstlane_b32 s7, v16
@@ -78581,7 +78884,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s45, v2
; VI-NEXT: s_cmp_lg_u32 s44, 0
; VI-NEXT: v_readfirstlane_b32 s44, v1
-; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
; VI-NEXT: s_cbranch_scc0 .LBB57_4
; VI-NEXT: ; %bb.1: ; %cmp.false
@@ -79478,8 +79780,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v29, s97, 31
; GFX9-NEXT: v_writelane_b32 v29, s98, 32
; GFX9-NEXT: v_writelane_b32 v29, s99, 33
-; GFX9-NEXT: v_readfirstlane_b32 s44, v19
; GFX9-NEXT: v_writelane_b32 v29, s30, 34
+; GFX9-NEXT: v_writelane_b32 v29, s31, 35
+; GFX9-NEXT: v_readfirstlane_b32 s44, v19
; GFX9-NEXT: v_readfirstlane_b32 s5, v18
; GFX9-NEXT: v_readfirstlane_b32 s4, v17
; GFX9-NEXT: v_readfirstlane_b32 s7, v16
@@ -79499,7 +79802,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s45, v2
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: v_readfirstlane_b32 s44, v1
-; GFX9-NEXT: v_writelane_b32 v29, s31, 35
; GFX9-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
; GFX9-NEXT: s_cbranch_scc0 .LBB57_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
@@ -80307,42 +80609,14 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v27, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v24, s34, 0
-; GFX11-NEXT: v_writelane_b32 v25, s98, 0
-; GFX11-NEXT: v_readfirstlane_b32 s42, v15
-; GFX11-NEXT: v_readfirstlane_b32 s5, v14
-; GFX11-NEXT: v_readfirstlane_b32 s4, v13
; GFX11-NEXT: v_writelane_b32 v24, s35, 1
-; GFX11-NEXT: v_writelane_b32 v25, s99, 1
-; GFX11-NEXT: v_readfirstlane_b32 s7, v12
-; GFX11-NEXT: v_readfirstlane_b32 s6, v11
-; GFX11-NEXT: v_readfirstlane_b32 s9, v10
; GFX11-NEXT: v_writelane_b32 v24, s36, 2
-; GFX11-NEXT: v_writelane_b32 v25, s100, 2
-; GFX11-NEXT: v_readfirstlane_b32 s8, v9
-; GFX11-NEXT: v_readfirstlane_b32 s11, v8
-; GFX11-NEXT: v_readfirstlane_b32 s10, v7
; GFX11-NEXT: v_writelane_b32 v24, s37, 3
-; GFX11-NEXT: v_writelane_b32 v25, s101, 3
-; GFX11-NEXT: v_readfirstlane_b32 s13, v6
-; GFX11-NEXT: v_readfirstlane_b32 s12, v5
-; GFX11-NEXT: v_readfirstlane_b32 s15, v4
; GFX11-NEXT: v_writelane_b32 v24, s38, 4
-; GFX11-NEXT: v_writelane_b32 v25, s102, 4
-; GFX11-NEXT: v_readfirstlane_b32 s14, v3
-; GFX11-NEXT: v_readfirstlane_b32 s41, v2
-; GFX11-NEXT: v_readfirstlane_b32 s40, v1
; GFX11-NEXT: v_writelane_b32 v24, s39, 5
-; GFX11-NEXT: v_writelane_b32 v25, s103, 5
-; GFX11-NEXT: s_cmp_lg_u32 s42, 0
-; GFX11-NEXT: s_mov_b32 vcc_lo, 0
-; GFX11-NEXT: ; implicit-def: $vgpr27 : SGPR spill to VGPR lane
-; GFX11-NEXT: ; implicit-def: $vgpr26 : SGPR spill to VGPR lane
; GFX11-NEXT: v_writelane_b32 v24, s48, 6
-; GFX11-NEXT: v_writelane_b32 v25, s104, 6
; GFX11-NEXT: v_writelane_b32 v24, s49, 7
-; GFX11-NEXT: v_writelane_b32 v25, s30, 7
; GFX11-NEXT: v_writelane_b32 v24, s50, 8
-; GFX11-NEXT: v_writelane_b32 v25, s31, 8
; GFX11-NEXT: v_writelane_b32 v24, s51, 9
; GFX11-NEXT: v_writelane_b32 v24, s52, 10
; GFX11-NEXT: v_writelane_b32 v24, s53, 11
@@ -80366,6 +80640,34 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: v_writelane_b32 v24, s87, 29
; GFX11-NEXT: v_writelane_b32 v24, s96, 30
; GFX11-NEXT: v_writelane_b32 v24, s97, 31
+; GFX11-NEXT: v_writelane_b32 v25, s98, 0
+; GFX11-NEXT: v_writelane_b32 v25, s99, 1
+; GFX11-NEXT: v_writelane_b32 v25, s100, 2
+; GFX11-NEXT: v_writelane_b32 v25, s101, 3
+; GFX11-NEXT: v_writelane_b32 v25, s102, 4
+; GFX11-NEXT: v_writelane_b32 v25, s103, 5
+; GFX11-NEXT: v_writelane_b32 v25, s104, 6
+; GFX11-NEXT: v_writelane_b32 v25, s30, 7
+; GFX11-NEXT: v_writelane_b32 v25, s31, 8
+; GFX11-NEXT: v_readfirstlane_b32 s42, v15
+; GFX11-NEXT: v_readfirstlane_b32 s5, v14
+; GFX11-NEXT: v_readfirstlane_b32 s4, v13
+; GFX11-NEXT: v_readfirstlane_b32 s7, v12
+; GFX11-NEXT: v_readfirstlane_b32 s6, v11
+; GFX11-NEXT: v_readfirstlane_b32 s9, v10
+; GFX11-NEXT: v_readfirstlane_b32 s8, v9
+; GFX11-NEXT: v_readfirstlane_b32 s11, v8
+; GFX11-NEXT: v_readfirstlane_b32 s10, v7
+; GFX11-NEXT: v_readfirstlane_b32 s13, v6
+; GFX11-NEXT: v_readfirstlane_b32 s12, v5
+; GFX11-NEXT: v_readfirstlane_b32 s15, v4
+; GFX11-NEXT: v_readfirstlane_b32 s14, v3
+; GFX11-NEXT: v_readfirstlane_b32 s41, v2
+; GFX11-NEXT: v_readfirstlane_b32 s40, v1
+; GFX11-NEXT: s_cmp_lg_u32 s42, 0
+; GFX11-NEXT: s_mov_b32 vcc_lo, 0
+; GFX11-NEXT: ; implicit-def: $vgpr27 : SGPR spill to VGPR lane
+; GFX11-NEXT: ; implicit-def: $vgpr26 : SGPR spill to VGPR lane
; GFX11-NEXT: s_cbranch_scc0 .LBB57_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s5, 24
@@ -85181,53 +85483,99 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:392
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:384
@@ -85980,53 +86328,99 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) #0 {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
@@ -86875,8 +87269,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; SI-LABEL: bitcast_v128i8_to_v16i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s76, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -86893,6 +87285,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_readfirstlane_b32 s76, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324
; SI-NEXT: v_readfirstlane_b32 s46, v30
; SI-NEXT: v_readfirstlane_b32 s6, v29
; SI-NEXT: v_readfirstlane_b32 s7, v28
@@ -88018,15 +88412,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-LABEL: bitcast_v128i8_to_v16i64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s45, v15
-; VI-NEXT: v_readfirstlane_b32 s46, v14
-; VI-NEXT: v_readfirstlane_b32 s47, v13
-; VI-NEXT: v_readfirstlane_b32 s76, v0
-; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -88043,6 +88428,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_readfirstlane_b32 s45, v15
+; VI-NEXT: v_readfirstlane_b32 s46, v14
+; VI-NEXT: v_readfirstlane_b32 s47, v13
+; VI-NEXT: v_readfirstlane_b32 s76, v0
+; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; VI-NEXT: v_readfirstlane_b32 s6, v30
; VI-NEXT: v_readfirstlane_b32 s7, v29
; VI-NEXT: v_readfirstlane_b32 s8, v28
@@ -88070,7 +88464,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s73, v3
; VI-NEXT: v_readfirstlane_b32 s74, v2
; VI-NEXT: v_readfirstlane_b32 s75, v1
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -88903,16 +89297,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-LABEL: bitcast_v128i8_to_v16i64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s45, v15
-; GFX9-NEXT: v_readfirstlane_b32 s46, v14
-; GFX9-NEXT: v_readfirstlane_b32 s47, v13
-; GFX9-NEXT: v_readfirstlane_b32 s76, v0
-; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -88929,6 +89313,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_readfirstlane_b32 s45, v15
+; GFX9-NEXT: v_readfirstlane_b32 s46, v14
+; GFX9-NEXT: v_readfirstlane_b32 s47, v13
+; GFX9-NEXT: v_readfirstlane_b32 s76, v0
+; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; GFX9-NEXT: v_readfirstlane_b32 s6, v30
; GFX9-NEXT: v_readfirstlane_b32 s7, v29
; GFX9-NEXT: v_readfirstlane_b32 s8, v28
@@ -88956,7 +89349,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s73, v3
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s75, v1
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -89799,6 +90192,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
+; GFX11-TRUE16-NEXT: ; meta instruction
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
@@ -89914,9 +90311,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s73, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s74, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s75, 0
-; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-TRUE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -90372,6 +90766,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
+; GFX11-FAKE16-NEXT: ; meta instruction
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-FAKE16-NEXT: s_clause 0x1f
; GFX11-FAKE16-NEXT: scratch_load_u16 v32, off, s32 offset:312
; GFX11-FAKE16-NEXT: scratch_load_u16 v34, off, s32 offset:308
@@ -90487,9 +90885,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s73, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s74, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s75, 0
-; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-FAKE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -91808,8 +92203,9 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
; SI-NEXT: v_writelane_b32 v33, s97, 31
; SI-NEXT: v_writelane_b32 v33, s98, 32
; SI-NEXT: v_writelane_b32 v33, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v33, s30, 34
+; SI-NEXT: v_writelane_b32 v33, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_readfirstlane_b32 s70, v17
; SI-NEXT: v_readfirstlane_b32 s71, v16
; SI-NEXT: v_readfirstlane_b32 s80, v15
@@ -91829,7 +92225,6 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
; SI-NEXT: v_readfirstlane_b32 s8, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s9, v0
-; SI-NEXT: v_writelane_b32 v33, s31, 35
; SI-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB61_4
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -92542,8 +92937,6 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64bf16_to_v16i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -92560,6 +92953,8 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v0
; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v1
@@ -92644,8 +93039,10 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: v_mul_f32_e32 v62, 1.0, v54
; SI-NEXT: v_mul_f32_e32 v60, 1.0, v53
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v37
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -92730,7 +93127,6 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18
; SI-NEXT: v_mul_f32_e32 v58, 1.0, v52
; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19
; SI-NEXT: v_mul_f32_e32 v47, 1.0, v51
@@ -94357,26 +94753,41 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) #0 {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -95537,14 +95948,28 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
; SI-NEXT: v_writelane_b32 v63, s37, 3
; SI-NEXT: v_writelane_b32 v63, s30, 4
-; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: v_writelane_b32 v63, s31, 5
+; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: s_and_b32 s12, s25, 0xffff0000
; SI-NEXT: s_and_b32 s30, vcc_lo, 0xffff0000
; SI-NEXT: s_lshl_b32 s31, vcc_lo, 16
@@ -95679,27 +96104,11 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s43
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 vcc_lo, 0
; SI-NEXT: v_mul_f32_e64 v55, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v53, 1.0, s40
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s29
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s28
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v62, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s26
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s25
@@ -96377,8 +96786,8 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_writelane_b32 v35, s64, 12
; VI-NEXT: v_writelane_b32 v35, s65, 13
; VI-NEXT: v_writelane_b32 v35, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v35, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -97088,8 +97497,8 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_writelane_b32 v36, s64, 12
; GFX9-NEXT: v_writelane_b32 v36, s65, 13
; GFX9-NEXT: v_writelane_b32 v36, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v36, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -97803,56 +98212,56 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 9
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 10
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 11
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 12
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 13
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 14
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 15
; GFX11-TRUE16-NEXT: s_mov_b32 s36, s0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s53, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s52, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s51, s27
+; GFX11-TRUE16-NEXT: s_mov_b32 s50, s26
+; GFX11-TRUE16-NEXT: s_mov_b32 s49, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s48, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23
; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21
; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20
; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19
; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-TRUE16-NEXT: s_mov_b32 s48, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17
; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16
; GFX11-TRUE16-NEXT: s_mov_b32 s39, s3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-TRUE16-NEXT: s_mov_b32 s49, s25
; GFX11-TRUE16-NEXT: s_mov_b32 s38, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s37, s1
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-TRUE16-NEXT: s_mov_b32 s50, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-TRUE16-NEXT: s_mov_b32 s51, s27
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 8
-; GFX11-TRUE16-NEXT: s_mov_b32 s52, s28
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 9
-; GFX11-TRUE16-NEXT: s_mov_b32 s53, s29
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 10
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 11
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 12
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 13
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 14
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 15
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -98529,56 +98938,56 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 9
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 11
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 12
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 13
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 14
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 15
; GFX11-FAKE16-NEXT: s_mov_b32 s36, s0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s53, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s52, s28
+; GFX11-FAKE16-NEXT: s_mov_b32 s51, s27
+; GFX11-FAKE16-NEXT: s_mov_b32 s50, s26
+; GFX11-FAKE16-NEXT: s_mov_b32 s49, s25
+; GFX11-FAKE16-NEXT: s_mov_b32 s48, s24
; GFX11-FAKE16-NEXT: s_mov_b32 s47, s23
; GFX11-FAKE16-NEXT: s_mov_b32 s46, s22
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
; GFX11-FAKE16-NEXT: s_mov_b32 s45, s21
; GFX11-FAKE16-NEXT: s_mov_b32 s44, s20
; GFX11-FAKE16-NEXT: s_mov_b32 s43, s19
; GFX11-FAKE16-NEXT: s_mov_b32 s42, s18
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-FAKE16-NEXT: s_mov_b32 s48, s24
; GFX11-FAKE16-NEXT: s_mov_b32 s41, s17
; GFX11-FAKE16-NEXT: s_mov_b32 s40, s16
; GFX11-FAKE16-NEXT: s_mov_b32 s39, s3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-FAKE16-NEXT: s_mov_b32 s49, s25
; GFX11-FAKE16-NEXT: s_mov_b32 s38, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s37, s1
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-FAKE16-NEXT: s_mov_b32 s50, s26
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-FAKE16-NEXT: s_mov_b32 s51, s27
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 8
-; GFX11-FAKE16-NEXT: s_mov_b32 s52, s28
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 9
-; GFX11-FAKE16-NEXT: s_mov_b32 s53, s29
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 10
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 11
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 12
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 13
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 14
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 15
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_3
; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -99299,8 +99708,6 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i64_to_v64f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -99317,6 +99724,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr44
@@ -99348,13 +99757,14 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB64_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16
; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16
@@ -99429,6 +99839,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) #0 {
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16
@@ -99569,6 +99980,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53
; SI-NEXT: v_or_b32_e32 v30, v30, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51
; SI-NEXT: v_or_b32_e32 v7, v7, v54
@@ -99791,8 +100203,9 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_writelane_b32 v32, s68, 18
; SI-NEXT: v_writelane_b32 v32, s69, 19
-; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_writelane_b32 v32, s30, 20
+; SI-NEXT: v_writelane_b32 v32, s31, 21
+; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_readfirstlane_b32 s5, v17
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_readfirstlane_b32 s7, v15
@@ -99812,7 +100225,6 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s45, v1
; SI-NEXT: s_cmp_lg_u32 s44, 0
; SI-NEXT: v_readfirstlane_b32 s44, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 21
; SI-NEXT: s_cbranch_scc0 .LBB65_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 16
@@ -101392,12 +101804,12 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v34, s97, 31
; SI-NEXT: v_writelane_b32 v34, s98, 32
; SI-NEXT: v_writelane_b32 v34, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s6, v17
; SI-NEXT: v_writelane_b32 v34, s30, 34
+; SI-NEXT: v_writelane_b32 v34, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s6, v17
; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16
; SI-NEXT: v_readfirstlane_b32 s8, v16
; SI-NEXT: ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v34, s31, 35
; SI-NEXT: s_lshr_b32 vcc_hi, s8, 16
; SI-NEXT: v_readfirstlane_b32 s10, v15
; SI-NEXT: v_readfirstlane_b32 s12, v14
@@ -101914,8 +102326,8 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v33, s64, 12
; VI-NEXT: v_writelane_b32 v33, s65, 13
; VI-NEXT: v_writelane_b32 v33, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v33, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -102194,8 +102606,8 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -102332,56 +102744,56 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB67_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -102482,8 +102894,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i64_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -102500,6 +102910,8 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr44
@@ -102531,13 +102943,14 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB68_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16
; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16
@@ -102612,6 +103025,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) #0 {
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16
@@ -102752,6 +103166,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53
; SI-NEXT: v_or_b32_e32 v30, v30, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51
; SI-NEXT: v_or_b32_e32 v7, v7, v54
@@ -102974,8 +103389,9 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_writelane_b32 v32, s68, 18
; SI-NEXT: v_writelane_b32 v32, s69, 19
-; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_writelane_b32 v32, s30, 20
+; SI-NEXT: v_writelane_b32 v32, s31, 21
+; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_readfirstlane_b32 s5, v17
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_readfirstlane_b32 s7, v15
@@ -102995,7 +103411,6 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s45, v1
; SI-NEXT: s_cmp_lg_u32 s44, 0
; SI-NEXT: v_readfirstlane_b32 s44, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 21
; SI-NEXT: s_cbranch_scc0 .LBB69_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 16
@@ -104405,12 +104820,12 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v32, s97, 31
; SI-NEXT: v_writelane_b32 v32, s98, 32
; SI-NEXT: v_writelane_b32 v32, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s9, v16
; SI-NEXT: v_writelane_b32 v32, s30, 34
+; SI-NEXT: v_writelane_b32 v32, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s9, v16
; SI-NEXT: s_lshr_b32 s14, s9, 16
; SI-NEXT: v_readfirstlane_b32 s13, v14
; SI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v32, s31, 35
; SI-NEXT: v_readfirstlane_b32 s7, v17
; SI-NEXT: v_readfirstlane_b32 s11, v15
; SI-NEXT: s_lshr_b32 s72, s13, 16
@@ -105061,8 +105476,8 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -105198,56 +105613,56 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB71_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -105348,6 +105763,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f64_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -105480,22 +105911,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr42
@@ -105527,13 +105942,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB72_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -105764,6 +106180,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB72_4
; SI-NEXT: ; %bb.3: ; %cmp.true
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24
@@ -106542,6 +106959,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v16f64_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -106644,22 +107077,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr57
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -107443,6 +107860,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v16f64_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -107539,23 +107972,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr39
; GFX9-NEXT: ; kill: killed $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr39
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr59
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; kill: killed $vgpr39
@@ -107565,6 +107981,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; kill: killed $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -107643,7 +108060,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: s_waitcnt vmcnt(25)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -107656,7 +108073,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: s_waitcnt vmcnt(25)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -107845,7 +108262,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB72_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
-; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
@@ -108710,31 +109127,50 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) #0 {
; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -109211,7 +109647,19 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
@@ -109247,8 +109695,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v63, s98, 32
; SI-NEXT: v_writelane_b32 v63, s99, 33
; SI-NEXT: v_writelane_b32 v63, s30, 34
-; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_writelane_b32 v63, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_readfirstlane_b32 s5, v18
; SI-NEXT: v_readfirstlane_b32 s4, v17
; SI-NEXT: v_readfirstlane_b32 s7, v16
@@ -109268,19 +109716,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s45, v2
; SI-NEXT: s_cmp_lg_u32 s44, 0
; SI-NEXT: v_readfirstlane_b32 s44, v1
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB73_3
@@ -110603,6 +111038,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v63, s34, 0
; VI-NEXT: v_writelane_b32 v63, s35, 1
; VI-NEXT: v_writelane_b32 v63, s36, 2
@@ -110634,8 +111083,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v63, s86, 28
; VI-NEXT: v_writelane_b32 v63, s87, 29
; VI-NEXT: v_writelane_b32 v63, s30, 30
-; VI-NEXT: v_readfirstlane_b32 s44, v19
; VI-NEXT: v_writelane_b32 v63, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s44, v19
; VI-NEXT: v_readfirstlane_b32 s5, v18
; VI-NEXT: v_readfirstlane_b32 s4, v17
; VI-NEXT: v_readfirstlane_b32 s7, v16
@@ -110655,20 +111104,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s45, v2
; VI-NEXT: s_cmp_lg_u32 s44, 0
; VI-NEXT: v_readfirstlane_b32 s44, v1
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; VI-NEXT: s_cbranch_scc0 .LBB73_3
; VI-NEXT: ; %bb.1: ; %cmp.false
@@ -111831,6 +112266,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s34, 0
; GFX9-NEXT: v_writelane_b32 v63, s35, 1
; GFX9-NEXT: v_writelane_b32 v63, s36, 2
@@ -111866,8 +112315,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v63, s98, 32
; GFX9-NEXT: v_writelane_b32 v63, s99, 33
; GFX9-NEXT: v_writelane_b32 v63, s30, 34
-; GFX9-NEXT: v_readfirstlane_b32 s44, v19
; GFX9-NEXT: v_writelane_b32 v63, s31, 35
+; GFX9-NEXT: v_readfirstlane_b32 s44, v19
; GFX9-NEXT: v_readfirstlane_b32 s5, v18
; GFX9-NEXT: v_readfirstlane_b32 s4, v17
; GFX9-NEXT: v_readfirstlane_b32 s7, v16
@@ -111887,20 +112336,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s45, v2
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: v_readfirstlane_b32 s44, v1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; GFX9-NEXT: s_cbranch_scc0 .LBB73_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
@@ -113064,62 +113499,51 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v74, s34, 0
-; GFX11-NEXT: v_writelane_b32 v75, s98, 0
-; GFX11-NEXT: v_readfirstlane_b32 s42, v15
-; GFX11-NEXT: v_readfirstlane_b32 s41, v14
-; GFX11-NEXT: v_readfirstlane_b32 s40, v13
-; GFX11-NEXT: v_writelane_b32 v74, s35, 1
-; GFX11-NEXT: v_writelane_b32 v75, s99, 1
-; GFX11-NEXT: v_readfirstlane_b32 s15, v12
-; GFX11-NEXT: v_readfirstlane_b32 s14, v11
-; GFX11-NEXT: v_readfirstlane_b32 s11, v10
-; GFX11-NEXT: v_writelane_b32 v74, s36, 2
-; GFX11-NEXT: v_writelane_b32 v75, s100, 2
-; GFX11-NEXT: v_readfirstlane_b32 s10, v9
-; GFX11-NEXT: v_readfirstlane_b32 s9, v8
-; GFX11-NEXT: v_readfirstlane_b32 s8, v7
-; GFX11-NEXT: v_writelane_b32 v74, s37, 3
-; GFX11-NEXT: v_writelane_b32 v75, s101, 3
-; GFX11-NEXT: v_readfirstlane_b32 s7, v6
-; GFX11-NEXT: v_readfirstlane_b32 s6, v5
-; GFX11-NEXT: v_readfirstlane_b32 s5, v4
-; GFX11-NEXT: v_writelane_b32 v74, s38, 4
-; GFX11-NEXT: v_writelane_b32 v75, s102, 4
-; GFX11-NEXT: v_readfirstlane_b32 s4, v3
-; GFX11-NEXT: v_readfirstlane_b32 s13, v2
-; GFX11-NEXT: v_readfirstlane_b32 s12, v1
-; GFX11-NEXT: v_writelane_b32 v74, s39, 5
-; GFX11-NEXT: v_writelane_b32 v75, s103, 5
-; GFX11-NEXT: s_cmp_lg_u32 s42, 0
-; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:4
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v73, s32
+; GFX11-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-NEXT: v_writelane_b32 v74, s39, 5
; GFX11-NEXT: v_writelane_b32 v74, s48, 6
-; GFX11-NEXT: v_writelane_b32 v75, s104, 6
-; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
-; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-NEXT: v_writelane_b32 v74, s49, 7
-; GFX11-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-NEXT: v_writelane_b32 v74, s50, 8
-; GFX11-NEXT: v_writelane_b32 v75, s31, 8
; GFX11-NEXT: v_writelane_b32 v74, s51, 9
; GFX11-NEXT: v_writelane_b32 v74, s52, 10
; GFX11-NEXT: v_writelane_b32 v74, s53, 11
@@ -113143,6 +113567,34 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11-NEXT: v_writelane_b32 v74, s87, 29
; GFX11-NEXT: v_writelane_b32 v74, s96, 30
; GFX11-NEXT: v_writelane_b32 v74, s97, 31
+; GFX11-NEXT: v_writelane_b32 v75, s98, 0
+; GFX11-NEXT: v_writelane_b32 v75, s99, 1
+; GFX11-NEXT: v_writelane_b32 v75, s100, 2
+; GFX11-NEXT: v_writelane_b32 v75, s101, 3
+; GFX11-NEXT: v_writelane_b32 v75, s102, 4
+; GFX11-NEXT: v_writelane_b32 v75, s103, 5
+; GFX11-NEXT: v_writelane_b32 v75, s104, 6
+; GFX11-NEXT: v_writelane_b32 v75, s30, 7
+; GFX11-NEXT: v_writelane_b32 v75, s31, 8
+; GFX11-NEXT: v_readfirstlane_b32 s42, v15
+; GFX11-NEXT: v_readfirstlane_b32 s41, v14
+; GFX11-NEXT: v_readfirstlane_b32 s40, v13
+; GFX11-NEXT: v_readfirstlane_b32 s15, v12
+; GFX11-NEXT: v_readfirstlane_b32 s14, v11
+; GFX11-NEXT: v_readfirstlane_b32 s11, v10
+; GFX11-NEXT: v_readfirstlane_b32 s10, v9
+; GFX11-NEXT: v_readfirstlane_b32 s9, v8
+; GFX11-NEXT: v_readfirstlane_b32 s8, v7
+; GFX11-NEXT: v_readfirstlane_b32 s7, v6
+; GFX11-NEXT: v_readfirstlane_b32 s6, v5
+; GFX11-NEXT: v_readfirstlane_b32 s5, v4
+; GFX11-NEXT: v_readfirstlane_b32 s4, v3
+; GFX11-NEXT: v_readfirstlane_b32 s13, v2
+; GFX11-NEXT: v_readfirstlane_b32 s12, v1
+; GFX11-NEXT: s_cmp_lg_u32 s42, 0
+; GFX11-NEXT: s_mov_b32 vcc_lo, 0
+; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
+; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-NEXT: s_cbranch_scc0 .LBB73_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s11, 16
@@ -118004,53 +118456,99 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:392
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:384
@@ -118803,53 +119301,99 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) #0 {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v86, v0
; GFX11-FAKE16-NEXT: s_clause 0x1f
@@ -119698,8 +120242,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; SI-LABEL: bitcast_v128i8_to_v16f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s76, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -119716,6 +120258,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_readfirstlane_b32 s76, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324
; SI-NEXT: v_readfirstlane_b32 s46, v30
; SI-NEXT: v_readfirstlane_b32 s6, v29
; SI-NEXT: v_readfirstlane_b32 s7, v28
@@ -120841,15 +121385,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-LABEL: bitcast_v128i8_to_v16f64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s45, v15
-; VI-NEXT: v_readfirstlane_b32 s46, v14
-; VI-NEXT: v_readfirstlane_b32 s47, v13
-; VI-NEXT: v_readfirstlane_b32 s76, v0
-; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -120866,6 +121401,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_readfirstlane_b32 s45, v15
+; VI-NEXT: v_readfirstlane_b32 s46, v14
+; VI-NEXT: v_readfirstlane_b32 s47, v13
+; VI-NEXT: v_readfirstlane_b32 s76, v0
+; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; VI-NEXT: v_readfirstlane_b32 s6, v30
; VI-NEXT: v_readfirstlane_b32 s7, v29
; VI-NEXT: v_readfirstlane_b32 s8, v28
@@ -120893,7 +121437,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s73, v3
; VI-NEXT: v_readfirstlane_b32 s74, v2
; VI-NEXT: v_readfirstlane_b32 s75, v1
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -121726,16 +122270,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-LABEL: bitcast_v128i8_to_v16f64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s45, v15
-; GFX9-NEXT: v_readfirstlane_b32 s46, v14
-; GFX9-NEXT: v_readfirstlane_b32 s47, v13
-; GFX9-NEXT: v_readfirstlane_b32 s76, v0
-; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -121752,6 +122286,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_readfirstlane_b32 s45, v15
+; GFX9-NEXT: v_readfirstlane_b32 s46, v14
+; GFX9-NEXT: v_readfirstlane_b32 s47, v13
+; GFX9-NEXT: v_readfirstlane_b32 s76, v0
+; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; GFX9-NEXT: v_readfirstlane_b32 s6, v30
; GFX9-NEXT: v_readfirstlane_b32 s7, v29
; GFX9-NEXT: v_readfirstlane_b32 s8, v28
@@ -121779,7 +122322,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s73, v3
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s75, v1
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -122622,6 +123165,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
+; GFX11-TRUE16-NEXT: ; meta instruction
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308
@@ -122737,9 +123284,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s73, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s74, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s75, 0
-; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-TRUE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -123195,6 +123739,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
+; GFX11-FAKE16-NEXT: ; meta instruction
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-FAKE16-NEXT: s_clause 0x1f
; GFX11-FAKE16-NEXT: scratch_load_u16 v32, off, s32 offset:312
; GFX11-FAKE16-NEXT: scratch_load_u16 v34, off, s32 offset:308
@@ -123310,9 +123858,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s73, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s74, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s75, 0
-; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-FAKE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -123785,8 +124330,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f64_to_v64bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -123803,6 +124346,8 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr60
@@ -123834,7 +124379,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; kill: killed $vgpr32
@@ -123905,6 +124450,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB76_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
@@ -124036,6 +124582,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB76_4
; SI-NEXT: ; %bb.3: ; %cmp.true
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
@@ -124504,7 +125051,20 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
@@ -124540,8 +125100,8 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_writelane_b32 v63, s98, 32
; SI-NEXT: v_writelane_b32 v63, s99, 33
; SI-NEXT: v_writelane_b32 v63, s30, 34
-; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_writelane_b32 v63, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s44, v18
; SI-NEXT: v_readfirstlane_b32 s5, v17
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_readfirstlane_b32 s7, v15
@@ -124561,20 +125121,6 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_readfirstlane_b32 s45, v1
; SI-NEXT: s_cmp_lg_u32 s44, 0
; SI-NEXT: v_readfirstlane_b32 s44, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB77_3
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -125141,8 +125687,8 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s30, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s31, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s51, v17
; VI-NEXT: v_readfirstlane_b32 s50, v16
; VI-NEXT: v_readfirstlane_b32 s49, v15
@@ -125250,8 +125796,8 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -125351,36 +125897,36 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-NEXT: s_mov_b32 s37, s29
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: s_mov_b32 s36, s28
; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: s_mov_b32 s13, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB77_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -125457,8 +126003,6 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64bf16_to_v16f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -125475,6 +126019,8 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v0
; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v1
@@ -125559,8 +126105,10 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: v_mul_f32_e32 v62, 1.0, v54
; SI-NEXT: v_mul_f32_e32 v60, 1.0, v53
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v37
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -125645,7 +126193,6 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18
; SI-NEXT: v_mul_f32_e32 v58, 1.0, v52
; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19
; SI-NEXT: v_mul_f32_e32 v47, 1.0, v51
@@ -127272,26 +127819,41 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) #0 {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -128452,14 +129014,28 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
; SI-NEXT: v_writelane_b32 v63, s37, 3
; SI-NEXT: v_writelane_b32 v63, s30, 4
-; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: v_writelane_b32 v63, s31, 5
+; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: s_and_b32 s12, s25, 0xffff0000
; SI-NEXT: s_and_b32 s30, vcc_lo, 0xffff0000
; SI-NEXT: s_lshl_b32 s31, vcc_lo, 16
@@ -128594,27 +129170,11 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s43
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 vcc_lo, 0
; SI-NEXT: v_mul_f32_e64 v55, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v53, 1.0, s40
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s29
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s28
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v62, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s26
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s25
@@ -129292,8 +129852,8 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_writelane_b32 v35, s64, 12
; VI-NEXT: v_writelane_b32 v35, s65, 13
; VI-NEXT: v_writelane_b32 v35, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v35, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -130003,8 +130563,8 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX9-NEXT: v_writelane_b32 v36, s64, 12
; GFX9-NEXT: v_writelane_b32 v36, s65, 13
; GFX9-NEXT: v_writelane_b32 v36, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v36, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -130718,56 +131278,56 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 9
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 10
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 11
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 12
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 13
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 14
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 15
; GFX11-TRUE16-NEXT: s_mov_b32 s36, s0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s53, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s52, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s51, s27
+; GFX11-TRUE16-NEXT: s_mov_b32 s50, s26
+; GFX11-TRUE16-NEXT: s_mov_b32 s49, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s48, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23
; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21
; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20
; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19
; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-TRUE16-NEXT: s_mov_b32 s48, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17
; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16
; GFX11-TRUE16-NEXT: s_mov_b32 s39, s3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-TRUE16-NEXT: s_mov_b32 s49, s25
; GFX11-TRUE16-NEXT: s_mov_b32 s38, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s37, s1
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-TRUE16-NEXT: s_mov_b32 s50, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-TRUE16-NEXT: s_mov_b32 s51, s27
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 8
-; GFX11-TRUE16-NEXT: s_mov_b32 s52, s28
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 9
-; GFX11-TRUE16-NEXT: s_mov_b32 s53, s29
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 10
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 11
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 12
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 13
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 14
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 15
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB79_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -131444,56 +132004,56 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 9
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 11
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 12
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 13
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 14
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 15
; GFX11-FAKE16-NEXT: s_mov_b32 s36, s0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s53, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s52, s28
+; GFX11-FAKE16-NEXT: s_mov_b32 s51, s27
+; GFX11-FAKE16-NEXT: s_mov_b32 s50, s26
+; GFX11-FAKE16-NEXT: s_mov_b32 s49, s25
+; GFX11-FAKE16-NEXT: s_mov_b32 s48, s24
; GFX11-FAKE16-NEXT: s_mov_b32 s47, s23
; GFX11-FAKE16-NEXT: s_mov_b32 s46, s22
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
; GFX11-FAKE16-NEXT: s_mov_b32 s45, s21
; GFX11-FAKE16-NEXT: s_mov_b32 s44, s20
; GFX11-FAKE16-NEXT: s_mov_b32 s43, s19
; GFX11-FAKE16-NEXT: s_mov_b32 s42, s18
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-FAKE16-NEXT: s_mov_b32 s48, s24
; GFX11-FAKE16-NEXT: s_mov_b32 s41, s17
; GFX11-FAKE16-NEXT: s_mov_b32 s40, s16
; GFX11-FAKE16-NEXT: s_mov_b32 s39, s3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-FAKE16-NEXT: s_mov_b32 s49, s25
; GFX11-FAKE16-NEXT: s_mov_b32 s38, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s37, s1
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-FAKE16-NEXT: s_mov_b32 s50, s26
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-FAKE16-NEXT: s_mov_b32 s51, s27
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 8
-; GFX11-FAKE16-NEXT: s_mov_b32 s52, s28
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 9
-; GFX11-FAKE16-NEXT: s_mov_b32 s53, s29
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 10
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 11
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 12
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 13
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 14
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 15
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB79_3
; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -132214,8 +132774,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f64_to_v64f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -132232,6 +132790,8 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr46
@@ -132263,13 +132823,14 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB80_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16
; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16
@@ -132327,6 +132888,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
@@ -132470,6 +133032,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53
; SI-NEXT: v_or_b32_e32 v30, v30, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51
; SI-NEXT: v_or_b32_e32 v9, v9, v52
@@ -132614,7 +133177,21 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
@@ -132636,8 +133213,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: v_writelane_b32 v63, s68, 18
; SI-NEXT: v_writelane_b32 v63, s69, 19
; SI-NEXT: v_writelane_b32 v63, s30, 20
-; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v63, s31, 21
+; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_readfirstlane_b32 s45, v17
; SI-NEXT: v_readfirstlane_b32 s44, v16
; SI-NEXT: v_readfirstlane_b32 s43, v15
@@ -132657,21 +133234,6 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: v_readfirstlane_b32 s5, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB81_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s69, s45, 16
@@ -133023,8 +133585,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s30, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s31, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s51, v17
; VI-NEXT: v_readfirstlane_b32 s50, v16
; VI-NEXT: v_readfirstlane_b32 s49, v15
@@ -133132,8 +133694,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -133233,36 +133795,36 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-NEXT: s_mov_b32 s37, s29
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: s_mov_b32 s36, s28
; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: s_mov_b32 s13, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB81_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -134337,12 +134899,12 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; SI-NEXT: v_writelane_b32 v34, s97, 31
; SI-NEXT: v_writelane_b32 v34, s98, 32
; SI-NEXT: v_writelane_b32 v34, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s6, v17
; SI-NEXT: v_writelane_b32 v34, s30, 34
+; SI-NEXT: v_writelane_b32 v34, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s6, v17
; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16
; SI-NEXT: v_readfirstlane_b32 s8, v16
; SI-NEXT: ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v34, s31, 35
; SI-NEXT: s_lshr_b32 vcc_hi, s8, 16
; SI-NEXT: v_readfirstlane_b32 s10, v15
; SI-NEXT: v_readfirstlane_b32 s12, v14
@@ -134859,8 +135421,8 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; VI-NEXT: v_writelane_b32 v33, s64, 12
; VI-NEXT: v_writelane_b32 v33, s65, 13
; VI-NEXT: v_writelane_b32 v33, s66, 14
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v33, s67, 15
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -135139,8 +135701,8 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -135277,56 +135839,56 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB83_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -135427,8 +135989,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f64_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -135445,6 +136005,8 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr46
@@ -135476,13 +136038,14 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB84_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16
; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16
@@ -135540,6 +136103,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16
@@ -135683,6 +136247,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53
; SI-NEXT: v_or_b32_e32 v30, v30, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51
; SI-NEXT: v_or_b32_e32 v9, v9, v52
@@ -135827,7 +136392,21 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
@@ -135849,8 +136428,8 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v63, s68, 18
; SI-NEXT: v_writelane_b32 v63, s69, 19
; SI-NEXT: v_writelane_b32 v63, s30, 20
-; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v63, s31, 21
+; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_readfirstlane_b32 s45, v17
; SI-NEXT: v_readfirstlane_b32 s44, v16
; SI-NEXT: v_readfirstlane_b32 s43, v15
@@ -135870,21 +136449,6 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s5, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB85_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s69, s45, 16
@@ -136236,8 +136800,8 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s30, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s31, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s51, v17
; VI-NEXT: v_readfirstlane_b32 s50, v16
; VI-NEXT: v_readfirstlane_b32 s49, v15
@@ -136345,8 +136909,8 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -136446,36 +137010,36 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-NEXT: s_mov_b32 s37, s29
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: s_mov_b32 s36, s28
; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: s_mov_b32 s13, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB85_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -137380,12 +137944,12 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s97, 31
; SI-NEXT: v_writelane_b32 v32, s98, 32
; SI-NEXT: v_writelane_b32 v32, s99, 33
-; SI-NEXT: v_readfirstlane_b32 s9, v16
; SI-NEXT: v_writelane_b32 v32, s30, 34
+; SI-NEXT: v_writelane_b32 v32, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s9, v16
; SI-NEXT: s_lshr_b32 s14, s9, 16
; SI-NEXT: v_readfirstlane_b32 s13, v14
; SI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v32, s31, 35
; SI-NEXT: v_readfirstlane_b32 s7, v17
; SI-NEXT: v_readfirstlane_b32 s11, v15
; SI-NEXT: s_lshr_b32 s72, s13, 16
@@ -138036,8 +138600,8 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: v_writelane_b32 v32, s66, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s67, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -138173,56 +138737,56 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
+; GFX11-NEXT: v_writelane_b32 v32, s66, 14
+; GFX11-NEXT: v_writelane_b32 v32, s67, 15
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s67, v13
+; GFX11-NEXT: v_readfirstlane_b32 s66, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
-; GFX11-NEXT: v_writelane_b32 v32, s66, 14
-; GFX11-NEXT: v_readfirstlane_b32 s66, v12
-; GFX11-NEXT: v_writelane_b32 v32, s67, 15
-; GFX11-NEXT: v_readfirstlane_b32 s67, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB87_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -142911,29 +143475,53 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x18 ; 100-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:488
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:484
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:480
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:476
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:472
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:468
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:464
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:460
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:456
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:452
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:448
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:444
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:440
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:436
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:432
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:428
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:424
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:420
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:416
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:412
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:408
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:404
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:400
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:396
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:392
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
@@ -143691,53 +144279,99 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) #0 {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, v24 :: v_dual_mov_b32 v51, v19
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, v23 :: v_dual_mov_b32 v71, v20
@@ -144574,6 +145208,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_writelane_b32 v40, s34, 0
+; SI-NEXT: v_writelane_b32 v40, s35, 1
+; SI-NEXT: v_writelane_b32 v40, s36, 2
+; SI-NEXT: v_writelane_b32 v40, s37, 3
+; SI-NEXT: v_writelane_b32 v40, s38, 4
+; SI-NEXT: v_writelane_b32 v40, s39, 5
+; SI-NEXT: v_writelane_b32 v40, s48, 6
+; SI-NEXT: v_writelane_b32 v40, s49, 7
+; SI-NEXT: v_writelane_b32 v40, s50, 8
+; SI-NEXT: v_writelane_b32 v40, s51, 9
+; SI-NEXT: v_writelane_b32 v40, s52, 10
+; SI-NEXT: v_writelane_b32 v40, s53, 11
+; SI-NEXT: v_writelane_b32 v40, s54, 12
+; SI-NEXT: v_writelane_b32 v40, s55, 13
+; SI-NEXT: v_writelane_b32 v40, s64, 14
+; SI-NEXT: v_writelane_b32 v40, s65, 15
+; SI-NEXT: v_writelane_b32 v40, s66, 16
+; SI-NEXT: v_writelane_b32 v40, s67, 17
+; SI-NEXT: v_writelane_b32 v40, s68, 18
+; SI-NEXT: v_writelane_b32 v40, s69, 19
+; SI-NEXT: v_writelane_b32 v40, s70, 20
+; SI-NEXT: v_writelane_b32 v40, s71, 21
+; SI-NEXT: v_writelane_b32 v40, s80, 22
+; SI-NEXT: v_writelane_b32 v40, s81, 23
+; SI-NEXT: v_writelane_b32 v40, s82, 24
+; SI-NEXT: v_writelane_b32 v40, s83, 25
+; SI-NEXT: v_writelane_b32 v40, s84, 26
+; SI-NEXT: v_writelane_b32 v40, s85, 27
+; SI-NEXT: v_writelane_b32 v40, s86, 28
+; SI-NEXT: v_writelane_b32 v40, s87, 29
+; SI-NEXT: v_writelane_b32 v40, s96, 30
+; SI-NEXT: v_writelane_b32 v40, s97, 31
+; SI-NEXT: v_writelane_b32 v40, s98, 32
+; SI-NEXT: v_writelane_b32 v40, s99, 33
+; SI-NEXT: v_writelane_b32 v40, s30, 34
+; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: v_readfirstlane_b32 s4, v30
; SI-NEXT: s_waitcnt expcnt(0)
@@ -144593,25 +145264,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s4, v27
; SI-NEXT: v_writelane_b32 v43, s4, 11
; SI-NEXT: v_readfirstlane_b32 s4, v26
-; SI-NEXT: v_writelane_b32 v40, s34, 0
; SI-NEXT: v_writelane_b32 v43, s4, 12
; SI-NEXT: v_readfirstlane_b32 s4, v25
-; SI-NEXT: v_writelane_b32 v40, s35, 1
; SI-NEXT: v_writelane_b32 v43, s4, 13
; SI-NEXT: v_readfirstlane_b32 s4, v24
-; SI-NEXT: v_writelane_b32 v40, s36, 2
; SI-NEXT: v_writelane_b32 v43, s4, 14
; SI-NEXT: v_readfirstlane_b32 s4, v23
-; SI-NEXT: v_writelane_b32 v40, s37, 3
; SI-NEXT: v_writelane_b32 v43, s4, 15
; SI-NEXT: v_readfirstlane_b32 s4, v22
-; SI-NEXT: v_writelane_b32 v40, s38, 4
; SI-NEXT: v_writelane_b32 v43, s4, 16
; SI-NEXT: v_readfirstlane_b32 s4, v21
-; SI-NEXT: v_writelane_b32 v40, s39, 5
; SI-NEXT: v_writelane_b32 v43, s4, 17
; SI-NEXT: v_readfirstlane_b32 s4, v20
-; SI-NEXT: v_writelane_b32 v40, s48, 6
; SI-NEXT: v_writelane_b32 v43, s4, 18
; SI-NEXT: v_readfirstlane_b32 s4, v19
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:328
@@ -144639,10 +145303,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:332
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:236
-; SI-NEXT: v_writelane_b32 v40, s49, 7
-; SI-NEXT: v_writelane_b32 v40, s50, 8
-; SI-NEXT: v_writelane_b32 v40, s51, 9
-; SI-NEXT: v_writelane_b32 v40, s52, 10
; SI-NEXT: s_mov_b32 s37, s18
; SI-NEXT: v_readfirstlane_b32 s18, v18
; SI-NEXT: v_readfirstlane_b32 s38, v17
@@ -144652,10 +145312,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:228
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220
-; SI-NEXT: v_writelane_b32 v40, s53, 11
-; SI-NEXT: v_writelane_b32 v40, s54, 12
-; SI-NEXT: v_writelane_b32 v40, s55, 13
-; SI-NEXT: v_writelane_b32 v40, s64, 14
; SI-NEXT: s_mov_b32 s73, s19
; SI-NEXT: v_readfirstlane_b32 s55, v14
; SI-NEXT: v_readfirstlane_b32 s34, v13
@@ -144679,32 +145335,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176
-; SI-NEXT: v_writelane_b32 v40, s65, 15
-; SI-NEXT: v_writelane_b32 v40, s66, 16
; SI-NEXT: v_writelane_b32 v43, s4, 19
-; SI-NEXT: v_writelane_b32 v40, s67, 17
-; SI-NEXT: v_writelane_b32 v40, s68, 18
-; SI-NEXT: v_writelane_b32 v40, s69, 19
-; SI-NEXT: v_writelane_b32 v40, s70, 20
-; SI-NEXT: v_writelane_b32 v40, s71, 21
-; SI-NEXT: v_writelane_b32 v40, s80, 22
-; SI-NEXT: v_writelane_b32 v40, s81, 23
-; SI-NEXT: v_writelane_b32 v40, s82, 24
-; SI-NEXT: v_writelane_b32 v40, s83, 25
-; SI-NEXT: v_writelane_b32 v40, s84, 26
-; SI-NEXT: v_writelane_b32 v40, s85, 27
-; SI-NEXT: v_writelane_b32 v40, s86, 28
-; SI-NEXT: v_writelane_b32 v40, s87, 29
-; SI-NEXT: v_writelane_b32 v40, s96, 30
-; SI-NEXT: v_writelane_b32 v40, s97, 31
-; SI-NEXT: v_writelane_b32 v40, s98, 32
-; SI-NEXT: v_writelane_b32 v40, s99, 33
-; SI-NEXT: v_writelane_b32 v40, s30, 34
-; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: v_readfirstlane_b32 s53, v6
; SI-NEXT: v_readfirstlane_b32 s76, v5
; SI-NEXT: v_readfirstlane_b32 s77, v4
; SI-NEXT: v_readfirstlane_b32 s48, v3
+; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; SI-NEXT: s_mov_b32 s6, s20
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s91, v19
; SI-NEXT: v_readfirstlane_b32 s43, v20
@@ -144748,8 +145385,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:120
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112
-; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT: s_mov_b32 s6, s20
; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: v_writelane_b32 v43, s4, 21
; SI-NEXT: s_waitcnt vmcnt(14)
@@ -144759,14 +145394,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v43, s4, 23
; SI-NEXT: v_readfirstlane_b32 s4, v15
; SI-NEXT: v_writelane_b32 v43, s4, 24
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_writelane_b32 v43, s4, 25
; SI-NEXT: v_readfirstlane_b32 s4, v13
@@ -144806,6 +145433,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80
; SI-NEXT: v_writelane_b32 v43, s4, 33
; SI-NEXT: v_writelane_b32 v43, s37, 34
; SI-NEXT: v_writelane_b32 v43, s73, 35
@@ -144850,21 +145485,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s10, v28
; SI-NEXT: v_readfirstlane_b32 s8, v29
; SI-NEXT: v_readfirstlane_b32 s12, v30
-; SI-NEXT: v_readfirstlane_b32 s41, v15
-; SI-NEXT: v_readfirstlane_b32 s11, v16
-; SI-NEXT: v_readfirstlane_b32 s27, v17
-; SI-NEXT: v_readfirstlane_b32 s28, v18
-; SI-NEXT: v_readfirstlane_b32 s13, v31
-; SI-NEXT: v_readfirstlane_b32 s99, v32
-; SI-NEXT: v_readfirstlane_b32 s98, v33
-; SI-NEXT: v_readfirstlane_b32 s9, v34
; SI-NEXT: v_readfirstlane_b32 s85, v7
; SI-NEXT: v_readfirstlane_b32 s71, v8
; SI-NEXT: v_readfirstlane_b32 s70, v9
; SI-NEXT: v_readfirstlane_b32 s80, v10
; SI-NEXT: v_readfirstlane_b32 s81, v39
; SI-NEXT: v_readfirstlane_b32 s67, v48
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s24, v11
; SI-NEXT: v_readfirstlane_b32 s68, v50
; SI-NEXT: v_writelane_b32 v42, s68, 0
@@ -144873,12 +145499,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v42, s70, 2
; SI-NEXT: v_writelane_b32 v42, s71, 3
; SI-NEXT: v_writelane_b32 v42, s80, 4
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s82, v37
; SI-NEXT: v_writelane_b32 v42, s81, 5
; SI-NEXT: v_readfirstlane_b32 s83, v36
; SI-NEXT: v_writelane_b32 v42, s82, 6
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s84, v38
; SI-NEXT: v_writelane_b32 v42, s83, 7
; SI-NEXT: v_writelane_b32 v42, s84, 8
@@ -144890,17 +145516,27 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v42, s87, 11
; SI-NEXT: v_readfirstlane_b32 s97, v35
; SI-NEXT: v_writelane_b32 v42, s96, 12
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_readfirstlane_b32 s98, v33
; SI-NEXT: v_writelane_b32 v42, s97, 13
+; SI-NEXT: v_readfirstlane_b32 s99, v32
; SI-NEXT: v_writelane_b32 v42, s98, 14
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_readfirstlane_b32 s9, v34
; SI-NEXT: v_writelane_b32 v42, s99, 15
; SI-NEXT: v_writelane_b32 v42, s9, 16
+; SI-NEXT: v_readfirstlane_b32 s27, v17
; SI-NEXT: v_writelane_b32 v42, s24, 17
+; SI-NEXT: v_readfirstlane_b32 s11, v16
; SI-NEXT: v_writelane_b32 v42, s27, 18
+; SI-NEXT: v_readfirstlane_b32 s28, v18
; SI-NEXT: v_writelane_b32 v42, s11, 19
+; SI-NEXT: v_readfirstlane_b32 s13, v31
; SI-NEXT: v_writelane_b32 v42, s28, 20
; SI-NEXT: v_writelane_b32 v42, s13, 21
; SI-NEXT: v_writelane_b32 v42, s8, 22
; SI-NEXT: v_writelane_b32 v42, s10, 23
+; SI-NEXT: v_readfirstlane_b32 s41, v15
; SI-NEXT: v_writelane_b32 v42, s12, 24
; SI-NEXT: v_writelane_b32 v42, s41, 25
; SI-NEXT: v_writelane_b32 v42, s45, 26
@@ -146126,10 +146762,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s63, v12
-; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:280
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -146146,6 +146778,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_readfirstlane_b32 s63, v12
+; VI-NEXT: v_readfirstlane_b32 s8, v0
+; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:280
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
; VI-NEXT: v_readfirstlane_b32 s45, v11
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:12
@@ -147015,19 +147651,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-LABEL: bitcast_v128i8_to_v64bf16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s46, v12
-; GFX9-NEXT: v_readfirstlane_b32 s61, v11
-; GFX9-NEXT: v_readfirstlane_b32 s63, v10
-; GFX9-NEXT: v_readfirstlane_b32 s7, v0
-; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
-; GFX9-NEXT: v_readfirstlane_b32 s43, v2
-; GFX9-NEXT: v_readfirstlane_b32 s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v5
-; GFX9-NEXT: v_readfirstlane_b32 s10, v4
-; GFX9-NEXT: v_readfirstlane_b32 s15, v3
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -147044,6 +147667,19 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_readfirstlane_b32 s46, v12
+; GFX9-NEXT: v_readfirstlane_b32 s61, v11
+; GFX9-NEXT: v_readfirstlane_b32 s63, v10
+; GFX9-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; GFX9-NEXT: v_readfirstlane_b32 s43, v2
+; GFX9-NEXT: v_readfirstlane_b32 s6, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v5
+; GFX9-NEXT: v_readfirstlane_b32 s10, v4
+; GFX9-NEXT: v_readfirstlane_b32 s15, v3
; GFX9-NEXT: v_readfirstlane_b32 s12, v9
; GFX9-NEXT: v_readfirstlane_b32 s14, v8
; GFX9-NEXT: v_readfirstlane_b32 s47, v7
@@ -147067,7 +147703,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s58, v16
; GFX9-NEXT: v_readfirstlane_b32 s73, v15
; GFX9-NEXT: v_readfirstlane_b32 s75, v14
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -147933,6 +148569,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
+; GFX11-TRUE16-NEXT: ; meta instruction
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:308
@@ -148048,9 +148688,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s75, 0
-; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-TRUE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -148451,6 +149088,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:320 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_clause 0x1f
; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:312
; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:308
@@ -148566,7 +149204,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s75, 0
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:320 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-FAKE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -149053,16 +149690,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64bf16_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
-; SI-NEXT: v_mov_b32_e32 v31, v0
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -149079,7 +149706,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
+; SI-NEXT: v_mov_b32_e32 v31, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -149174,9 +149810,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v33
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v36
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39
; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
@@ -149314,8 +149954,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: v_mul_f32_e32 v51, 1.0, v63
; SI-NEXT: v_mul_f32_e32 v63, 1.0, v58
; SI-NEXT: ; kill: killed $vgpr1
@@ -151098,6 +151736,22 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v64bf16_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -151120,22 +151774,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) #0 {
; VI-NEXT: ; implicit-def: $vgpr34
; VI-NEXT: ; kill: killed $vgpr34
; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr40
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; implicit-def: $vgpr35
@@ -151163,7 +151801,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) #0 {
; VI-NEXT: ; implicit-def: $vgpr38
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: ; kill: killed $vgpr33
@@ -151353,6 +151991,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -151762,6 +152401,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) #0 {
; VI-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
; VI-NEXT: v_alignbit_b32 v32, v32, v33, 16
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31
; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
@@ -154178,69 +154818,124 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) #0 {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x2
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v70, off, s32
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:236
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:232
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:228
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:224
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:220
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:216
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:212
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:208
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:204
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:200
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:196
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:192
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:188
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:184
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:180
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:176
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:172
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:168
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:164
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:160
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:156
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:152
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:148
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:144
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:140
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:136
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:132
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:128
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:124
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:120
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:116
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:112
; GFX11-TRUE16-NEXT: s_clause 0x18 ; 100-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:108
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:104
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:100
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:96
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:92
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:88
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:84
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:80
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:76
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:72
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:68
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:64
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:60
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:56
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:52
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:48
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:44
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:40
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:36
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:32
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:28
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:24
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:20
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:16
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:12
+; GFX11-TRUE16-NEXT: s_clause 0x2
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v70, off, s32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
@@ -155299,34 +155994,56 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) #0 {
; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: s_clause 0x16 ; 92-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:100
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:96
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:92
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:88
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:84
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:80
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:76
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:72
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:68
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:64
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:60
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:56
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:52
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:48
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:44
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:40
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:36
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:32
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:28
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:24
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:20
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:16
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:12
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -156309,7 +157026,19 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
@@ -156343,16 +157072,16 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_writelane_b32 v63, s96, 30
; SI-NEXT: v_writelane_b32 v63, s97, 31
; SI-NEXT: v_writelane_b32 v63, s98, 32
-; SI-NEXT: v_readfirstlane_b32 s56, v15
; SI-NEXT: v_writelane_b32 v63, s99, 33
-; SI-NEXT: s_and_b32 s57, s56, 0xffff0000
; SI-NEXT: v_writelane_b32 v63, s30, 34
+; SI-NEXT: v_writelane_b32 v63, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s56, v15
+; SI-NEXT: s_and_b32 s57, s56, 0xffff0000
; SI-NEXT: v_readfirstlane_b32 s46, v16
; SI-NEXT: s_lshl_b32 s56, s56, 16
; SI-NEXT: v_readfirstlane_b32 s78, v8
; SI-NEXT: v_readfirstlane_b32 vcc_lo, v3
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s57
-; SI-NEXT: v_writelane_b32 v63, s31, 35
; SI-NEXT: v_readfirstlane_b32 s44, v17
; SI-NEXT: s_and_b32 s47, s46, 0xffff0000
; SI-NEXT: s_lshl_b32 s46, s46, 16
@@ -156439,19 +157168,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s44
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 vcc_lo, 0
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16
@@ -156475,9 +157191,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_mul_f32_e64 v33, 1.0, s13
; SI-NEXT: v_mul_f32_e64 v35, 1.0, s10
; SI-NEXT: v_mul_f32_e64 v49, 1.0, s11
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_mul_f32_e64 v56, 1.0, s8
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v60, 1.0, s9
; SI-NEXT: v_mul_f32_e64 v54, 1.0, s6
; SI-NEXT: v_mul_f32_e64 v40, 1.0, s7
@@ -158277,6 +158991,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v63, s34, 0
; VI-NEXT: v_writelane_b32 v63, s35, 1
; VI-NEXT: v_writelane_b32 v63, s36, 2
@@ -158308,8 +159036,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_writelane_b32 v63, s86, 28
; VI-NEXT: v_writelane_b32 v63, s87, 29
; VI-NEXT: v_writelane_b32 v63, s30, 30
-; VI-NEXT: v_readfirstlane_b32 s4, v19
; VI-NEXT: v_writelane_b32 v63, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s4, v19
; VI-NEXT: v_readfirstlane_b32 s7, v18
; VI-NEXT: v_readfirstlane_b32 s6, v17
; VI-NEXT: v_readfirstlane_b32 s9, v16
@@ -158329,20 +159057,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; VI-NEXT: s_cbranch_scc0 .LBB91_3
; VI-NEXT: ; %bb.1: ; %cmp.false
@@ -160135,6 +160849,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s34, 0
; GFX9-NEXT: v_writelane_b32 v63, s35, 1
; GFX9-NEXT: v_writelane_b32 v63, s36, 2
@@ -160170,8 +160898,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_writelane_b32 v63, s98, 32
; GFX9-NEXT: v_writelane_b32 v63, s99, 33
; GFX9-NEXT: v_writelane_b32 v63, s30, 34
-; GFX9-NEXT: v_readfirstlane_b32 s4, v19
; GFX9-NEXT: v_writelane_b32 v63, s31, 35
+; GFX9-NEXT: v_readfirstlane_b32 s4, v19
; GFX9-NEXT: v_readfirstlane_b32 s7, v18
; GFX9-NEXT: v_readfirstlane_b32 s6, v17
; GFX9-NEXT: v_readfirstlane_b32 s9, v16
@@ -160191,20 +160919,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; GFX9-NEXT: s_cbranch_scc0 .LBB91_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
@@ -161916,62 +162630,51 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s34, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s98, 0
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v15
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s35, 1
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s99, 1
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s36, 2
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s100, 2
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s37, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s101, 3
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s38, 4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s102, 4
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s39, 5
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s103, 5
-; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s42, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-TRUE16-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:4
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s39, 5
; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s48, 6
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s104, 6
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s49, 7
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s50, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s31, 8
; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s51, 9
; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s52, 10
; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s53, 11
@@ -161995,6 +162698,34 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s87, 29
; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s96, 30
; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s97, 31
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s98, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s99, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s100, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s101, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s102, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s103, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s104, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s30, 7
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s31, 8
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v15
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s42, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB91_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24
@@ -163349,62 +164080,51 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s34, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s98, 0
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s42, v15
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s35, 1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s99, 1
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s36, 2
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s100, 2
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s37, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s101, 3
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v6
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v5
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s38, 4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s102, 4
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s39, 5
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s103, 5
-; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s42, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-FAKE16-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:4
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s39, 5
; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s48, 6
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s104, 6
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s49, 7
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s50, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s31, 8
; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s51, 9
; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s52, 10
; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s53, 11
@@ -163428,6 +164148,34 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s87, 29
; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s96, 30
; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s97, 31
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s98, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s99, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s100, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s101, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s102, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s103, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s104, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s30, 7
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s31, 8
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s42, v15
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v6
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v5
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s42, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB91_3
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24
@@ -169395,29 +170143,53 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x18 ; 100-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:488
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:484
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:480
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:476
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:472
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:468
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:464
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:460
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:456
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:452
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:448
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:444
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:440
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:436
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:432
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:428
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:424
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:420
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:416
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:412
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:408
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:404
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:400
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:396
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:392
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
@@ -170175,53 +170947,99 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) #0 {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, v24 :: v_dual_mov_b32 v51, v19
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, v23 :: v_dual_mov_b32 v71, v20
@@ -171058,9 +171876,45 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_writelane_b32 v40, s34, 0
+; SI-NEXT: v_writelane_b32 v40, s35, 1
+; SI-NEXT: v_writelane_b32 v40, s36, 2
+; SI-NEXT: v_writelane_b32 v40, s37, 3
+; SI-NEXT: v_writelane_b32 v40, s38, 4
+; SI-NEXT: v_writelane_b32 v40, s39, 5
+; SI-NEXT: v_writelane_b32 v40, s48, 6
+; SI-NEXT: v_writelane_b32 v40, s49, 7
+; SI-NEXT: v_writelane_b32 v40, s50, 8
+; SI-NEXT: v_writelane_b32 v40, s51, 9
+; SI-NEXT: v_writelane_b32 v40, s52, 10
+; SI-NEXT: v_writelane_b32 v40, s53, 11
+; SI-NEXT: v_writelane_b32 v40, s54, 12
+; SI-NEXT: v_writelane_b32 v40, s55, 13
+; SI-NEXT: v_writelane_b32 v40, s64, 14
+; SI-NEXT: v_writelane_b32 v40, s65, 15
+; SI-NEXT: v_writelane_b32 v40, s66, 16
+; SI-NEXT: v_writelane_b32 v40, s67, 17
+; SI-NEXT: v_writelane_b32 v40, s68, 18
+; SI-NEXT: v_writelane_b32 v40, s69, 19
+; SI-NEXT: v_writelane_b32 v40, s70, 20
+; SI-NEXT: v_writelane_b32 v40, s71, 21
+; SI-NEXT: v_writelane_b32 v40, s80, 22
+; SI-NEXT: v_writelane_b32 v40, s81, 23
+; SI-NEXT: v_writelane_b32 v40, s82, 24
+; SI-NEXT: v_writelane_b32 v40, s83, 25
+; SI-NEXT: v_writelane_b32 v40, s84, 26
+; SI-NEXT: v_writelane_b32 v40, s85, 27
+; SI-NEXT: v_writelane_b32 v40, s86, 28
+; SI-NEXT: v_writelane_b32 v40, s87, 29
+; SI-NEXT: v_writelane_b32 v40, s96, 30
+; SI-NEXT: v_writelane_b32 v40, s97, 31
+; SI-NEXT: v_writelane_b32 v40, s98, 32
+; SI-NEXT: v_writelane_b32 v40, s99, 33
+; SI-NEXT: v_writelane_b32 v40, s30, 34
+; SI-NEXT: v_writelane_b32 v40, s31, 35
+; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; SI-NEXT: v_readfirstlane_b32 s4, v30
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v43, s29, 0
; SI-NEXT: v_writelane_b32 v43, s28, 1
@@ -171069,61 +171923,39 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_writelane_b32 v43, s25, 4
; SI-NEXT: v_writelane_b32 v43, s24, 5
; SI-NEXT: v_writelane_b32 v43, s23, 6
-; SI-NEXT: v_writelane_b32 v40, s35, 1
; SI-NEXT: v_writelane_b32 v43, s22, 7
-; SI-NEXT: v_writelane_b32 v40, s36, 2
; SI-NEXT: v_writelane_b32 v43, s21, 8
-; SI-NEXT: v_writelane_b32 v40, s37, 3
; SI-NEXT: v_writelane_b32 v43, s20, 9
-; SI-NEXT: v_writelane_b32 v40, s38, 4
; SI-NEXT: v_writelane_b32 v43, s19, 10
-; SI-NEXT: v_writelane_b32 v40, s39, 5
; SI-NEXT: v_writelane_b32 v43, s18, 11
-; SI-NEXT: v_writelane_b32 v40, s48, 6
; SI-NEXT: v_writelane_b32 v43, s17, 12
-; SI-NEXT: v_writelane_b32 v40, s49, 7
; SI-NEXT: v_writelane_b32 v43, s16, 13
-; SI-NEXT: v_readfirstlane_b32 s4, v30
-; SI-NEXT: v_writelane_b32 v40, s50, 8
; SI-NEXT: v_writelane_b32 v43, s4, 14
; SI-NEXT: v_readfirstlane_b32 s4, v29
-; SI-NEXT: v_writelane_b32 v40, s51, 9
; SI-NEXT: v_writelane_b32 v43, s4, 15
; SI-NEXT: v_readfirstlane_b32 s4, v28
-; SI-NEXT: v_writelane_b32 v40, s52, 10
; SI-NEXT: v_writelane_b32 v43, s4, 16
; SI-NEXT: v_readfirstlane_b32 s4, v27
-; SI-NEXT: v_writelane_b32 v40, s53, 11
; SI-NEXT: v_writelane_b32 v43, s4, 17
; SI-NEXT: v_readfirstlane_b32 s4, v26
-; SI-NEXT: v_writelane_b32 v40, s54, 12
; SI-NEXT: v_writelane_b32 v43, s4, 18
; SI-NEXT: v_readfirstlane_b32 s4, v25
-; SI-NEXT: v_writelane_b32 v40, s55, 13
; SI-NEXT: v_writelane_b32 v43, s4, 19
; SI-NEXT: v_readfirstlane_b32 s4, v24
-; SI-NEXT: v_writelane_b32 v40, s64, 14
; SI-NEXT: v_writelane_b32 v43, s4, 20
; SI-NEXT: v_readfirstlane_b32 s4, v23
-; SI-NEXT: v_writelane_b32 v40, s65, 15
; SI-NEXT: v_writelane_b32 v43, s4, 21
; SI-NEXT: v_readfirstlane_b32 s4, v22
-; SI-NEXT: v_writelane_b32 v40, s66, 16
; SI-NEXT: v_writelane_b32 v43, s4, 22
; SI-NEXT: v_readfirstlane_b32 s4, v21
-; SI-NEXT: v_writelane_b32 v40, s67, 17
; SI-NEXT: v_writelane_b32 v43, s4, 23
; SI-NEXT: v_readfirstlane_b32 s4, v20
-; SI-NEXT: v_writelane_b32 v40, s68, 18
; SI-NEXT: v_writelane_b32 v43, s4, 24
; SI-NEXT: v_readfirstlane_b32 s4, v19
-; SI-NEXT: v_writelane_b32 v40, s69, 19
; SI-NEXT: v_writelane_b32 v43, s4, 25
; SI-NEXT: v_readfirstlane_b32 s4, v17
-; SI-NEXT: v_writelane_b32 v40, s70, 20
; SI-NEXT: v_writelane_b32 v43, s4, 26
; SI-NEXT: v_readfirstlane_b32 s4, v16
-; SI-NEXT: v_writelane_b32 v40, s71, 21
; SI-NEXT: v_readfirstlane_b32 s71, v18
; SI-NEXT: v_writelane_b32 v43, s4, 27
; SI-NEXT: v_readfirstlane_b32 s4, v15
@@ -171191,22 +172023,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188
-; SI-NEXT: v_writelane_b32 v40, s80, 22
-; SI-NEXT: v_writelane_b32 v40, s81, 23
-; SI-NEXT: v_writelane_b32 v40, s82, 24
-; SI-NEXT: v_writelane_b32 v40, s83, 25
-; SI-NEXT: v_writelane_b32 v40, s84, 26
; SI-NEXT: v_writelane_b32 v43, s4, 38
-; SI-NEXT: v_writelane_b32 v40, s85, 27
-; SI-NEXT: v_writelane_b32 v40, s86, 28
-; SI-NEXT: v_writelane_b32 v40, s87, 29
-; SI-NEXT: v_writelane_b32 v40, s96, 30
-; SI-NEXT: v_writelane_b32 v40, s97, 31
-; SI-NEXT: v_writelane_b32 v40, s98, 32
; SI-NEXT: v_readfirstlane_b32 s28, v3
-; SI-NEXT: v_writelane_b32 v40, s99, 33
-; SI-NEXT: v_writelane_b32 v40, s30, 34
-; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s56, v19
@@ -172653,10 +173471,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-LABEL: bitcast_v128i8_to_v64f16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s63, v12
-; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:280
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -172673,6 +173487,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_readfirstlane_b32 s63, v12
+; VI-NEXT: v_readfirstlane_b32 s8, v0
+; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:280
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
; VI-NEXT: v_readfirstlane_b32 s45, v11
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:12
@@ -173542,19 +174360,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-LABEL: bitcast_v128i8_to_v64f16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s46, v12
-; GFX9-NEXT: v_readfirstlane_b32 s61, v11
-; GFX9-NEXT: v_readfirstlane_b32 s63, v10
-; GFX9-NEXT: v_readfirstlane_b32 s7, v0
-; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
-; GFX9-NEXT: v_readfirstlane_b32 s43, v2
-; GFX9-NEXT: v_readfirstlane_b32 s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v5
-; GFX9-NEXT: v_readfirstlane_b32 s10, v4
-; GFX9-NEXT: v_readfirstlane_b32 s15, v3
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -173571,6 +174376,19 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_readfirstlane_b32 s46, v12
+; GFX9-NEXT: v_readfirstlane_b32 s61, v11
+; GFX9-NEXT: v_readfirstlane_b32 s63, v10
+; GFX9-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; GFX9-NEXT: v_readfirstlane_b32 s43, v2
+; GFX9-NEXT: v_readfirstlane_b32 s6, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v5
+; GFX9-NEXT: v_readfirstlane_b32 s10, v4
+; GFX9-NEXT: v_readfirstlane_b32 s15, v3
; GFX9-NEXT: v_readfirstlane_b32 s12, v9
; GFX9-NEXT: v_readfirstlane_b32 s14, v8
; GFX9-NEXT: v_readfirstlane_b32 s47, v7
@@ -173594,7 +174412,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s58, v16
; GFX9-NEXT: v_readfirstlane_b32 s73, v15
; GFX9-NEXT: v_readfirstlane_b32 s75, v14
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -174460,6 +175278,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
+; GFX11-TRUE16-NEXT: ; meta instruction
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:308
@@ -174575,9 +175397,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s75, 0
-; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-TRUE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -174978,6 +175797,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:320 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_clause 0x1f
; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:312
; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:308
@@ -175093,7 +175913,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s75, 0
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:320 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-FAKE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -178560,6 +179379,22 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v64f16_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -178677,23 +179512,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: ; kill: killed $vgpr48
; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: ; kill: killed $vgpr34
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: ; implicit-def: $vgpr56
@@ -178730,6 +179548,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr48
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
@@ -178753,7 +179572,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -178781,7 +179600,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(38)
+; GFX9-NEXT: s_waitcnt vmcnt(22)
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13
@@ -178942,7 +179761,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) #0 {
; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: s_movk_i32 s6, 0x200
; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: s_waitcnt vmcnt(12)
; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0]
@@ -179822,32 +180641,52 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) #0 {
; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: s_clause 0x14 ; 84-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:92
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:88
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:84
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:80
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:76
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:72
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:68
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:64
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:60
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:56
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:52
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:48
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:44
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:40
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:36
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:32
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:28
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:24
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:20
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:16
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:12
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -180374,25 +181213,25 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v34, s85, 27
; SI-NEXT: v_writelane_b32 v34, s86, 28
; SI-NEXT: v_writelane_b32 v34, s87, 29
+; SI-NEXT: v_writelane_b32 v34, s96, 30
+; SI-NEXT: v_writelane_b32 v34, s97, 31
+; SI-NEXT: v_writelane_b32 v34, s98, 32
+; SI-NEXT: v_writelane_b32 v34, s99, 33
+; SI-NEXT: v_writelane_b32 v34, s30, 34
+; SI-NEXT: v_writelane_b32 v34, s31, 35
; SI-NEXT: s_lshr_b32 s5, s16, 16
; SI-NEXT: ; implicit-def: $vgpr37 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v34, s96, 30
; SI-NEXT: s_lshr_b32 s6, s18, 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v37, s5, 0
-; SI-NEXT: v_writelane_b32 v34, s97, 31
; SI-NEXT: s_lshr_b32 s7, s20, 16
; SI-NEXT: v_writelane_b32 v37, s6, 2
-; SI-NEXT: v_writelane_b32 v34, s98, 32
; SI-NEXT: s_lshr_b32 s8, s22, 16
; SI-NEXT: v_writelane_b32 v37, s7, 4
-; SI-NEXT: v_writelane_b32 v34, s99, 33
; SI-NEXT: s_lshr_b32 s9, s24, 16
; SI-NEXT: v_writelane_b32 v37, s8, 5
-; SI-NEXT: v_writelane_b32 v34, s30, 34
; SI-NEXT: s_lshr_b32 s10, s26, 16
; SI-NEXT: v_writelane_b32 v37, s9, 6
-; SI-NEXT: v_writelane_b32 v34, s31, 35
; SI-NEXT: s_lshr_b32 s11, s28, 16
; SI-NEXT: v_readfirstlane_b32 s31, v3
; SI-NEXT: v_writelane_b32 v37, s10, 7
@@ -182018,6 +182857,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v63, s34, 0
; VI-NEXT: v_writelane_b32 v63, s35, 1
; VI-NEXT: v_writelane_b32 v63, s36, 2
@@ -182049,8 +182902,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v63, s86, 28
; VI-NEXT: v_writelane_b32 v63, s87, 29
; VI-NEXT: v_writelane_b32 v63, s30, 30
-; VI-NEXT: v_readfirstlane_b32 s4, v19
; VI-NEXT: v_writelane_b32 v63, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s4, v19
; VI-NEXT: v_readfirstlane_b32 s7, v18
; VI-NEXT: v_readfirstlane_b32 s6, v17
; VI-NEXT: v_readfirstlane_b32 s9, v16
@@ -182070,20 +182923,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; VI-NEXT: s_cbranch_scc0 .LBB95_3
; VI-NEXT: ; %bb.1: ; %cmp.false
@@ -183327,6 +184166,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s34, 0
; GFX9-NEXT: v_writelane_b32 v63, s35, 1
; GFX9-NEXT: v_writelane_b32 v63, s36, 2
@@ -183362,8 +184215,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v63, s98, 32
; GFX9-NEXT: v_writelane_b32 v63, s99, 33
; GFX9-NEXT: v_writelane_b32 v63, s30, 34
-; GFX9-NEXT: v_readfirstlane_b32 s4, v19
; GFX9-NEXT: v_writelane_b32 v63, s31, 35
+; GFX9-NEXT: v_readfirstlane_b32 s4, v19
; GFX9-NEXT: v_readfirstlane_b32 s7, v18
; GFX9-NEXT: v_readfirstlane_b32 s6, v17
; GFX9-NEXT: v_readfirstlane_b32 s9, v16
@@ -183383,20 +184236,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; GFX9-NEXT: s_cbranch_scc0 .LBB95_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
@@ -184564,62 +185403,51 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v74, s34, 0
-; GFX11-NEXT: v_writelane_b32 v75, s98, 0
-; GFX11-NEXT: v_readfirstlane_b32 s42, v15
-; GFX11-NEXT: v_readfirstlane_b32 s5, v14
-; GFX11-NEXT: v_readfirstlane_b32 s4, v13
-; GFX11-NEXT: v_writelane_b32 v74, s35, 1
-; GFX11-NEXT: v_writelane_b32 v75, s99, 1
-; GFX11-NEXT: v_readfirstlane_b32 s7, v12
-; GFX11-NEXT: v_readfirstlane_b32 s6, v11
-; GFX11-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-NEXT: v_writelane_b32 v74, s36, 2
-; GFX11-NEXT: v_writelane_b32 v75, s100, 2
-; GFX11-NEXT: v_readfirstlane_b32 s8, v9
-; GFX11-NEXT: v_readfirstlane_b32 s11, v8
-; GFX11-NEXT: v_readfirstlane_b32 s10, v7
-; GFX11-NEXT: v_writelane_b32 v74, s37, 3
-; GFX11-NEXT: v_writelane_b32 v75, s101, 3
-; GFX11-NEXT: v_readfirstlane_b32 s13, v6
-; GFX11-NEXT: v_readfirstlane_b32 s12, v5
-; GFX11-NEXT: v_readfirstlane_b32 s15, v4
-; GFX11-NEXT: v_writelane_b32 v74, s38, 4
-; GFX11-NEXT: v_writelane_b32 v75, s102, 4
-; GFX11-NEXT: v_readfirstlane_b32 s14, v3
-; GFX11-NEXT: v_readfirstlane_b32 s41, v2
-; GFX11-NEXT: v_readfirstlane_b32 s40, v1
-; GFX11-NEXT: v_writelane_b32 v74, s39, 5
-; GFX11-NEXT: v_writelane_b32 v75, s103, 5
-; GFX11-NEXT: s_cmp_lg_u32 s42, 0
-; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:4
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v73, s32
+; GFX11-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-NEXT: v_writelane_b32 v74, s39, 5
; GFX11-NEXT: v_writelane_b32 v74, s48, 6
-; GFX11-NEXT: v_writelane_b32 v75, s104, 6
-; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
-; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-NEXT: v_writelane_b32 v74, s49, 7
-; GFX11-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-NEXT: v_writelane_b32 v74, s50, 8
-; GFX11-NEXT: v_writelane_b32 v75, s31, 8
; GFX11-NEXT: v_writelane_b32 v74, s51, 9
; GFX11-NEXT: v_writelane_b32 v74, s52, 10
; GFX11-NEXT: v_writelane_b32 v74, s53, 11
@@ -184643,6 +185471,34 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: v_writelane_b32 v74, s87, 29
; GFX11-NEXT: v_writelane_b32 v74, s96, 30
; GFX11-NEXT: v_writelane_b32 v74, s97, 31
+; GFX11-NEXT: v_writelane_b32 v75, s98, 0
+; GFX11-NEXT: v_writelane_b32 v75, s99, 1
+; GFX11-NEXT: v_writelane_b32 v75, s100, 2
+; GFX11-NEXT: v_writelane_b32 v75, s101, 3
+; GFX11-NEXT: v_writelane_b32 v75, s102, 4
+; GFX11-NEXT: v_writelane_b32 v75, s103, 5
+; GFX11-NEXT: v_writelane_b32 v75, s104, 6
+; GFX11-NEXT: v_writelane_b32 v75, s30, 7
+; GFX11-NEXT: v_writelane_b32 v75, s31, 8
+; GFX11-NEXT: v_readfirstlane_b32 s42, v15
+; GFX11-NEXT: v_readfirstlane_b32 s5, v14
+; GFX11-NEXT: v_readfirstlane_b32 s4, v13
+; GFX11-NEXT: v_readfirstlane_b32 s7, v12
+; GFX11-NEXT: v_readfirstlane_b32 s6, v11
+; GFX11-NEXT: v_readfirstlane_b32 s9, v10
+; GFX11-NEXT: v_readfirstlane_b32 s8, v9
+; GFX11-NEXT: v_readfirstlane_b32 s11, v8
+; GFX11-NEXT: v_readfirstlane_b32 s10, v7
+; GFX11-NEXT: v_readfirstlane_b32 s13, v6
+; GFX11-NEXT: v_readfirstlane_b32 s12, v5
+; GFX11-NEXT: v_readfirstlane_b32 s15, v4
+; GFX11-NEXT: v_readfirstlane_b32 s14, v3
+; GFX11-NEXT: v_readfirstlane_b32 s41, v2
+; GFX11-NEXT: v_readfirstlane_b32 s40, v1
+; GFX11-NEXT: s_cmp_lg_u32 s42, 0
+; GFX11-NEXT: s_mov_b32 vcc_lo, 0
+; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
+; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-NEXT: s_cbranch_scc0 .LBB95_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s27, 24
@@ -190021,29 +190877,53 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x18 ; 100-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:488
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:484
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:480
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:476
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:472
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:468
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:464
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:460
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:456
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:452
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:448
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:444
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:440
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:436
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:432
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:428
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:424
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:420
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:416
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:412
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:408
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:404
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:400
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:396
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:392
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
@@ -190801,53 +191681,99 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) #0 {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, v24 :: v_dual_mov_b32 v51, v19
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, v23 :: v_dual_mov_b32 v71, v20
@@ -191684,9 +192610,45 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_writelane_b32 v40, s34, 0
+; SI-NEXT: v_writelane_b32 v40, s35, 1
+; SI-NEXT: v_writelane_b32 v40, s36, 2
+; SI-NEXT: v_writelane_b32 v40, s37, 3
+; SI-NEXT: v_writelane_b32 v40, s38, 4
+; SI-NEXT: v_writelane_b32 v40, s39, 5
+; SI-NEXT: v_writelane_b32 v40, s48, 6
+; SI-NEXT: v_writelane_b32 v40, s49, 7
+; SI-NEXT: v_writelane_b32 v40, s50, 8
+; SI-NEXT: v_writelane_b32 v40, s51, 9
+; SI-NEXT: v_writelane_b32 v40, s52, 10
+; SI-NEXT: v_writelane_b32 v40, s53, 11
+; SI-NEXT: v_writelane_b32 v40, s54, 12
+; SI-NEXT: v_writelane_b32 v40, s55, 13
+; SI-NEXT: v_writelane_b32 v40, s64, 14
+; SI-NEXT: v_writelane_b32 v40, s65, 15
+; SI-NEXT: v_writelane_b32 v40, s66, 16
+; SI-NEXT: v_writelane_b32 v40, s67, 17
+; SI-NEXT: v_writelane_b32 v40, s68, 18
+; SI-NEXT: v_writelane_b32 v40, s69, 19
+; SI-NEXT: v_writelane_b32 v40, s70, 20
+; SI-NEXT: v_writelane_b32 v40, s71, 21
+; SI-NEXT: v_writelane_b32 v40, s80, 22
+; SI-NEXT: v_writelane_b32 v40, s81, 23
+; SI-NEXT: v_writelane_b32 v40, s82, 24
+; SI-NEXT: v_writelane_b32 v40, s83, 25
+; SI-NEXT: v_writelane_b32 v40, s84, 26
+; SI-NEXT: v_writelane_b32 v40, s85, 27
+; SI-NEXT: v_writelane_b32 v40, s86, 28
+; SI-NEXT: v_writelane_b32 v40, s87, 29
+; SI-NEXT: v_writelane_b32 v40, s96, 30
+; SI-NEXT: v_writelane_b32 v40, s97, 31
+; SI-NEXT: v_writelane_b32 v40, s98, 32
+; SI-NEXT: v_writelane_b32 v40, s99, 33
+; SI-NEXT: v_writelane_b32 v40, s30, 34
+; SI-NEXT: v_writelane_b32 v40, s31, 35
+; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; SI-NEXT: v_readfirstlane_b32 s4, v30
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v43, s29, 0
; SI-NEXT: v_writelane_b32 v43, s28, 1
@@ -191695,61 +192657,39 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_writelane_b32 v43, s25, 4
; SI-NEXT: v_writelane_b32 v43, s24, 5
; SI-NEXT: v_writelane_b32 v43, s23, 6
-; SI-NEXT: v_writelane_b32 v40, s35, 1
; SI-NEXT: v_writelane_b32 v43, s22, 7
-; SI-NEXT: v_writelane_b32 v40, s36, 2
; SI-NEXT: v_writelane_b32 v43, s21, 8
-; SI-NEXT: v_writelane_b32 v40, s37, 3
; SI-NEXT: v_writelane_b32 v43, s20, 9
-; SI-NEXT: v_writelane_b32 v40, s38, 4
; SI-NEXT: v_writelane_b32 v43, s19, 10
-; SI-NEXT: v_writelane_b32 v40, s39, 5
; SI-NEXT: v_writelane_b32 v43, s18, 11
-; SI-NEXT: v_writelane_b32 v40, s48, 6
; SI-NEXT: v_writelane_b32 v43, s17, 12
-; SI-NEXT: v_writelane_b32 v40, s49, 7
; SI-NEXT: v_writelane_b32 v43, s16, 13
-; SI-NEXT: v_readfirstlane_b32 s4, v30
-; SI-NEXT: v_writelane_b32 v40, s50, 8
; SI-NEXT: v_writelane_b32 v43, s4, 14
; SI-NEXT: v_readfirstlane_b32 s4, v29
-; SI-NEXT: v_writelane_b32 v40, s51, 9
; SI-NEXT: v_writelane_b32 v43, s4, 15
; SI-NEXT: v_readfirstlane_b32 s4, v28
-; SI-NEXT: v_writelane_b32 v40, s52, 10
; SI-NEXT: v_writelane_b32 v43, s4, 16
; SI-NEXT: v_readfirstlane_b32 s4, v27
-; SI-NEXT: v_writelane_b32 v40, s53, 11
; SI-NEXT: v_writelane_b32 v43, s4, 17
; SI-NEXT: v_readfirstlane_b32 s4, v26
-; SI-NEXT: v_writelane_b32 v40, s54, 12
; SI-NEXT: v_writelane_b32 v43, s4, 18
; SI-NEXT: v_readfirstlane_b32 s4, v25
-; SI-NEXT: v_writelane_b32 v40, s55, 13
; SI-NEXT: v_writelane_b32 v43, s4, 19
; SI-NEXT: v_readfirstlane_b32 s4, v24
-; SI-NEXT: v_writelane_b32 v40, s64, 14
; SI-NEXT: v_writelane_b32 v43, s4, 20
; SI-NEXT: v_readfirstlane_b32 s4, v23
-; SI-NEXT: v_writelane_b32 v40, s65, 15
; SI-NEXT: v_writelane_b32 v43, s4, 21
; SI-NEXT: v_readfirstlane_b32 s4, v22
-; SI-NEXT: v_writelane_b32 v40, s66, 16
; SI-NEXT: v_writelane_b32 v43, s4, 22
; SI-NEXT: v_readfirstlane_b32 s4, v21
-; SI-NEXT: v_writelane_b32 v40, s67, 17
; SI-NEXT: v_writelane_b32 v43, s4, 23
; SI-NEXT: v_readfirstlane_b32 s4, v20
-; SI-NEXT: v_writelane_b32 v40, s68, 18
; SI-NEXT: v_writelane_b32 v43, s4, 24
; SI-NEXT: v_readfirstlane_b32 s4, v19
-; SI-NEXT: v_writelane_b32 v40, s69, 19
; SI-NEXT: v_writelane_b32 v43, s4, 25
; SI-NEXT: v_readfirstlane_b32 s4, v17
-; SI-NEXT: v_writelane_b32 v40, s70, 20
; SI-NEXT: v_writelane_b32 v43, s4, 26
; SI-NEXT: v_readfirstlane_b32 s4, v16
-; SI-NEXT: v_writelane_b32 v40, s71, 21
; SI-NEXT: v_readfirstlane_b32 s71, v18
; SI-NEXT: v_writelane_b32 v43, s4, 27
; SI-NEXT: v_readfirstlane_b32 s4, v15
@@ -191817,22 +192757,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188
-; SI-NEXT: v_writelane_b32 v40, s80, 22
-; SI-NEXT: v_writelane_b32 v40, s81, 23
-; SI-NEXT: v_writelane_b32 v40, s82, 24
-; SI-NEXT: v_writelane_b32 v40, s83, 25
-; SI-NEXT: v_writelane_b32 v40, s84, 26
; SI-NEXT: v_writelane_b32 v43, s4, 38
-; SI-NEXT: v_writelane_b32 v40, s85, 27
-; SI-NEXT: v_writelane_b32 v40, s86, 28
-; SI-NEXT: v_writelane_b32 v40, s87, 29
-; SI-NEXT: v_writelane_b32 v40, s96, 30
-; SI-NEXT: v_writelane_b32 v40, s97, 31
-; SI-NEXT: v_writelane_b32 v40, s98, 32
; SI-NEXT: v_readfirstlane_b32 s28, v3
-; SI-NEXT: v_writelane_b32 v40, s99, 33
-; SI-NEXT: v_writelane_b32 v40, s30, 34
-; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s56, v19
@@ -193279,10 +194205,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-LABEL: bitcast_v128i8_to_v64i16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s63, v12
-; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:280
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -193299,6 +194221,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_readfirstlane_b32 s63, v12
+; VI-NEXT: v_readfirstlane_b32 s8, v0
+; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:280
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
; VI-NEXT: v_readfirstlane_b32 s45, v11
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:12
@@ -194168,19 +195094,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-LABEL: bitcast_v128i8_to_v64i16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s46, v12
-; GFX9-NEXT: v_readfirstlane_b32 s61, v11
-; GFX9-NEXT: v_readfirstlane_b32 s63, v10
-; GFX9-NEXT: v_readfirstlane_b32 s7, v0
-; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
-; GFX9-NEXT: v_readfirstlane_b32 s43, v2
-; GFX9-NEXT: v_readfirstlane_b32 s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v5
-; GFX9-NEXT: v_readfirstlane_b32 s10, v4
-; GFX9-NEXT: v_readfirstlane_b32 s15, v3
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -194197,6 +195110,19 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_readfirstlane_b32 s46, v12
+; GFX9-NEXT: v_readfirstlane_b32 s61, v11
+; GFX9-NEXT: v_readfirstlane_b32 s63, v10
+; GFX9-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; GFX9-NEXT: v_readfirstlane_b32 s43, v2
+; GFX9-NEXT: v_readfirstlane_b32 s6, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v5
+; GFX9-NEXT: v_readfirstlane_b32 s10, v4
+; GFX9-NEXT: v_readfirstlane_b32 s15, v3
; GFX9-NEXT: v_readfirstlane_b32 s12, v9
; GFX9-NEXT: v_readfirstlane_b32 s14, v8
; GFX9-NEXT: v_readfirstlane_b32 s47, v7
@@ -194220,7 +195146,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s58, v16
; GFX9-NEXT: v_readfirstlane_b32 s73, v15
; GFX9-NEXT: v_readfirstlane_b32 s75, v14
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -195086,6 +196012,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
+; GFX11-TRUE16-NEXT: ; meta instruction
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:308
@@ -195201,9 +196131,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s75, 0
-; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:320
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-TRUE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -195604,6 +196531,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:320 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_clause 0x1f
; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:312
; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:308
@@ -195719,7 +196647,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s75, 0
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:320 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-FAKE16-NEXT: s_and_b32 s76, vcc_lo, exec_lo
@@ -198145,6 +199072,22 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v64i16_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32
@@ -198169,22 +199112,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) #0 {
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v16
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v12
@@ -198208,6 +199135,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) #0 {
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v19
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v36
; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v37
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -199305,6 +200233,22 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v64i16_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -199422,23 +200366,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: ; kill: killed $vgpr48
; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: ; kill: killed $vgpr34
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: ; implicit-def: $vgpr56
@@ -199475,6 +200402,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: ; implicit-def: $vgpr48
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
@@ -199498,7 +200426,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -199526,7 +200454,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(38)
+; GFX9-NEXT: s_waitcnt vmcnt(22)
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13
@@ -199686,7 +200614,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: s_cbranch_execz .LBB98_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: s_waitcnt vmcnt(12)
; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
@@ -200566,32 +201494,52 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) #0 {
; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: s_clause 0x14 ; 84-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:92
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:88
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:84
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:80
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:76
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:72
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:68
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:64
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:60
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:56
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:52
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:48
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:44
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:40
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:36
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:32
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:28
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:24
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:20
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:16
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:12
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -201115,33 +202063,34 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v20, s83, 25
; SI-NEXT: v_writelane_b32 v20, s84, 26
; SI-NEXT: v_writelane_b32 v20, s85, 27
-; SI-NEXT: v_readfirstlane_b32 s37, v1
; SI-NEXT: v_writelane_b32 v20, s86, 28
+; SI-NEXT: v_writelane_b32 v20, s87, 29
+; SI-NEXT: v_writelane_b32 v20, s96, 30
+; SI-NEXT: v_writelane_b32 v20, s97, 31
+; SI-NEXT: v_writelane_b32 v20, s98, 32
+; SI-NEXT: v_writelane_b32 v20, s99, 33
+; SI-NEXT: v_writelane_b32 v20, s30, 34
+; SI-NEXT: v_writelane_b32 v20, s31, 35
+; SI-NEXT: v_readfirstlane_b32 s37, v1
; SI-NEXT: v_readfirstlane_b32 s39, v3
; SI-NEXT: s_lshr_b32 s6, s37, 16
; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v20, s87, 29
; SI-NEXT: v_readfirstlane_b32 s49, v5
; SI-NEXT: s_lshr_b32 s7, s39, 16
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_writelane_b32 v21, s6, 0
-; SI-NEXT: v_writelane_b32 v20, s96, 30
; SI-NEXT: v_readfirstlane_b32 s51, v7
; SI-NEXT: s_lshr_b32 s8, s49, 16
; SI-NEXT: v_writelane_b32 v21, s7, 1
-; SI-NEXT: v_writelane_b32 v20, s97, 31
; SI-NEXT: v_readfirstlane_b32 s53, v9
; SI-NEXT: s_lshr_b32 s9, s51, 16
; SI-NEXT: v_writelane_b32 v21, s8, 2
-; SI-NEXT: v_writelane_b32 v20, s98, 32
; SI-NEXT: v_readfirstlane_b32 s55, v11
; SI-NEXT: s_lshr_b32 s10, s53, 16
; SI-NEXT: v_writelane_b32 v21, s9, 3
-; SI-NEXT: v_writelane_b32 v20, s99, 33
; SI-NEXT: v_readfirstlane_b32 s65, v13
; SI-NEXT: s_lshr_b32 s11, s55, 16
; SI-NEXT: v_writelane_b32 v21, s10, 4
-; SI-NEXT: v_writelane_b32 v20, s30, 34
; SI-NEXT: v_readfirstlane_b32 s68, v18
; SI-NEXT: v_readfirstlane_b32 s69, v17
; SI-NEXT: v_readfirstlane_b32 s66, v16
@@ -201155,7 +202104,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s38, v4
; SI-NEXT: v_readfirstlane_b32 s36, v2
; SI-NEXT: v_writelane_b32 v21, s11, 5
-; SI-NEXT: v_writelane_b32 v20, s31, 35
; SI-NEXT: s_lshr_b32 s92, s29, 16
; SI-NEXT: s_lshr_b32 s99, s28, 16
; SI-NEXT: s_lshr_b32 s93, s27, 16
@@ -202682,8 +203630,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: v_writelane_b32 v32, s86, 28
; VI-NEXT: v_writelane_b32 v32, s87, 29
-; VI-NEXT: v_readfirstlane_b32 s44, v19
; VI-NEXT: v_writelane_b32 v32, s30, 30
+; VI-NEXT: v_writelane_b32 v32, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s44, v19
; VI-NEXT: v_readfirstlane_b32 s5, v18
; VI-NEXT: v_readfirstlane_b32 s4, v17
; VI-NEXT: v_readfirstlane_b32 s7, v16
@@ -202703,7 +203652,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s45, v2
; VI-NEXT: s_cmp_lg_u32 s44, 0
; VI-NEXT: v_readfirstlane_b32 s44, v1
-; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
; VI-NEXT: s_cbranch_scc0 .LBB99_4
; VI-NEXT: ; %bb.1: ; %cmp.false
@@ -203694,6 +204642,20 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s34, 0
; GFX9-NEXT: v_writelane_b32 v63, s35, 1
; GFX9-NEXT: v_writelane_b32 v63, s36, 2
@@ -203729,8 +204691,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v63, s98, 32
; GFX9-NEXT: v_writelane_b32 v63, s99, 33
; GFX9-NEXT: v_writelane_b32 v63, s30, 34
-; GFX9-NEXT: v_readfirstlane_b32 s4, v19
; GFX9-NEXT: v_writelane_b32 v63, s31, 35
+; GFX9-NEXT: v_readfirstlane_b32 s4, v19
; GFX9-NEXT: v_readfirstlane_b32 s7, v18
; GFX9-NEXT: v_readfirstlane_b32 s6, v17
; GFX9-NEXT: v_readfirstlane_b32 s9, v16
@@ -203750,20 +204712,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; GFX9-NEXT: s_cbranch_scc0 .LBB99_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
@@ -204930,62 +205878,51 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v74, s34, 0
-; GFX11-NEXT: v_writelane_b32 v75, s98, 0
-; GFX11-NEXT: v_readfirstlane_b32 s42, v15
-; GFX11-NEXT: v_readfirstlane_b32 s5, v14
-; GFX11-NEXT: v_readfirstlane_b32 s4, v13
-; GFX11-NEXT: v_writelane_b32 v74, s35, 1
-; GFX11-NEXT: v_writelane_b32 v75, s99, 1
-; GFX11-NEXT: v_readfirstlane_b32 s7, v12
-; GFX11-NEXT: v_readfirstlane_b32 s6, v11
-; GFX11-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-NEXT: v_writelane_b32 v74, s36, 2
-; GFX11-NEXT: v_writelane_b32 v75, s100, 2
-; GFX11-NEXT: v_readfirstlane_b32 s8, v9
-; GFX11-NEXT: v_readfirstlane_b32 s11, v8
-; GFX11-NEXT: v_readfirstlane_b32 s10, v7
-; GFX11-NEXT: v_writelane_b32 v74, s37, 3
-; GFX11-NEXT: v_writelane_b32 v75, s101, 3
-; GFX11-NEXT: v_readfirstlane_b32 s13, v6
-; GFX11-NEXT: v_readfirstlane_b32 s12, v5
-; GFX11-NEXT: v_readfirstlane_b32 s15, v4
-; GFX11-NEXT: v_writelane_b32 v74, s38, 4
-; GFX11-NEXT: v_writelane_b32 v75, s102, 4
-; GFX11-NEXT: v_readfirstlane_b32 s14, v3
-; GFX11-NEXT: v_readfirstlane_b32 s41, v2
-; GFX11-NEXT: v_readfirstlane_b32 s40, v1
-; GFX11-NEXT: v_writelane_b32 v74, s39, 5
-; GFX11-NEXT: v_writelane_b32 v75, s103, 5
-; GFX11-NEXT: s_cmp_lg_u32 s42, 0
-; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:4
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v73, s32
+; GFX11-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-NEXT: v_writelane_b32 v74, s39, 5
; GFX11-NEXT: v_writelane_b32 v74, s48, 6
-; GFX11-NEXT: v_writelane_b32 v75, s104, 6
-; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
-; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-NEXT: v_writelane_b32 v74, s49, 7
-; GFX11-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-NEXT: v_writelane_b32 v74, s50, 8
-; GFX11-NEXT: v_writelane_b32 v75, s31, 8
; GFX11-NEXT: v_writelane_b32 v74, s51, 9
; GFX11-NEXT: v_writelane_b32 v74, s52, 10
; GFX11-NEXT: v_writelane_b32 v74, s53, 11
@@ -205009,6 +205946,34 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: v_writelane_b32 v74, s87, 29
; GFX11-NEXT: v_writelane_b32 v74, s96, 30
; GFX11-NEXT: v_writelane_b32 v74, s97, 31
+; GFX11-NEXT: v_writelane_b32 v75, s98, 0
+; GFX11-NEXT: v_writelane_b32 v75, s99, 1
+; GFX11-NEXT: v_writelane_b32 v75, s100, 2
+; GFX11-NEXT: v_writelane_b32 v75, s101, 3
+; GFX11-NEXT: v_writelane_b32 v75, s102, 4
+; GFX11-NEXT: v_writelane_b32 v75, s103, 5
+; GFX11-NEXT: v_writelane_b32 v75, s104, 6
+; GFX11-NEXT: v_writelane_b32 v75, s30, 7
+; GFX11-NEXT: v_writelane_b32 v75, s31, 8
+; GFX11-NEXT: v_readfirstlane_b32 s42, v15
+; GFX11-NEXT: v_readfirstlane_b32 s5, v14
+; GFX11-NEXT: v_readfirstlane_b32 s4, v13
+; GFX11-NEXT: v_readfirstlane_b32 s7, v12
+; GFX11-NEXT: v_readfirstlane_b32 s6, v11
+; GFX11-NEXT: v_readfirstlane_b32 s9, v10
+; GFX11-NEXT: v_readfirstlane_b32 s8, v9
+; GFX11-NEXT: v_readfirstlane_b32 s11, v8
+; GFX11-NEXT: v_readfirstlane_b32 s10, v7
+; GFX11-NEXT: v_readfirstlane_b32 s13, v6
+; GFX11-NEXT: v_readfirstlane_b32 s12, v5
+; GFX11-NEXT: v_readfirstlane_b32 s15, v4
+; GFX11-NEXT: v_readfirstlane_b32 s14, v3
+; GFX11-NEXT: v_readfirstlane_b32 s41, v2
+; GFX11-NEXT: v_readfirstlane_b32 s40, v1
+; GFX11-NEXT: s_cmp_lg_u32 s42, 0
+; GFX11-NEXT: s_mov_b32 vcc_lo, 0
+; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
+; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-NEXT: s_cbranch_scc0 .LBB99_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s27, 24
@@ -205802,6 +206767,22 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64bf16_to_v64f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
@@ -205830,28 +206811,11 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v23
; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23
; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v22
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v8
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
@@ -206965,8 +207929,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v64bf16_to_v64f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -206983,7 +207945,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -207232,6 +208196,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) #0 {
; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -207597,9 +208562,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v64bf16_to_v64f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -207616,7 +208578,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -207835,7 +208799,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) #0 {
; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -208137,20 +209101,35 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v48, v16
; GFX11-TRUE16-NEXT: s_clause 0x1
@@ -209267,14 +210246,28 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
; SI-NEXT: v_writelane_b32 v63, s37, 3
; SI-NEXT: v_writelane_b32 v63, s30, 4
-; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: v_writelane_b32 v63, s31, 5
+; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: s_and_b32 s6, s28, 0xffff0000
; SI-NEXT: v_readfirstlane_b32 s92, v4
; SI-NEXT: s_and_b32 s30, vcc_lo, 0xffff0000
@@ -209370,12 +210363,12 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: s_and_b32 s59, s58, 0xffff0000
; SI-NEXT: s_lshl_b32 s58, s58, 16
; SI-NEXT: s_and_b32 s63, s62, 0xffff0000
+; SI-NEXT: s_lshl_b32 s62, s62, 16
+; SI-NEXT: s_and_b32 s75, s74, 0xffff0000
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s11
-; SI-NEXT: s_lshl_b32 s62, s62, 16
-; SI-NEXT: s_and_b32 s75, s74, 0xffff0000
; SI-NEXT: s_lshl_b32 s74, s74, 16
; SI-NEXT: s_lshl_b32 s76, s76, 16
; SI-NEXT: s_lshl_b32 s78, s78, 16
@@ -209388,21 +210381,6 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: s_and_b32 s36, vcc_lo, 0xffff0000
; SI-NEXT: s_lshl_b32 s37, vcc_lo, 16
; SI-NEXT: v_readfirstlane_b32 vcc_lo, v18
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 vcc_lo, 0
; SI-NEXT: v_mul_f32_e64 v39, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v44, 1.0, s40
@@ -209412,7 +210390,6 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s26
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s24
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v62, 1.0, s14
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s12
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s10
@@ -209432,15 +210409,15 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s13
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s31
; SI-NEXT: v_mul_f32_e64 v16, 1.0, s9
; SI-NEXT: v_mul_f32_e64 v19, 1.0, s5
; SI-NEXT: v_mul_f32_e64 v22, 1.0, s35
; SI-NEXT: v_mul_f32_e64 v35, 1.0, s94
; SI-NEXT: v_mul_f32_e64 v34, 1.0, s90
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e64 v0, 1.0, s31
; SI-NEXT: v_mul_f32_e64 v31, 1.0, s78
; SI-NEXT: v_mul_f32_e64 v28, 1.0, s74
; SI-NEXT: v_mul_f32_e64 v25, 1.0, s62
@@ -210857,8 +211834,8 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_writelane_b32 v35, s50, 6
; VI-NEXT: v_writelane_b32 v35, s51, 7
; VI-NEXT: v_writelane_b32 v35, s30, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v35, s31, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s51, v17
; VI-NEXT: v_readfirstlane_b32 s50, v16
; VI-NEXT: v_readfirstlane_b32 s49, v15
@@ -211542,8 +212519,8 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; GFX9-NEXT: v_writelane_b32 v36, s50, 6
; GFX9-NEXT: v_writelane_b32 v36, s51, 7
; GFX9-NEXT: v_writelane_b32 v36, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v36, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -212237,36 +213214,36 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s37, s29
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s36, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB101_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -212927,36 +213904,36 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s37, s29
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-FAKE16-NEXT: s_mov_b32 s36, s28
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB101_3
; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -215005,7 +215982,20 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
@@ -215035,28 +216025,27 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: v_writelane_b32 v63, s84, 26
; SI-NEXT: v_writelane_b32 v63, s85, 27
; SI-NEXT: v_writelane_b32 v63, s86, 28
+; SI-NEXT: v_writelane_b32 v63, s87, 29
+; SI-NEXT: v_writelane_b32 v63, s96, 30
+; SI-NEXT: v_writelane_b32 v63, s97, 31
+; SI-NEXT: v_writelane_b32 v63, s98, 32
+; SI-NEXT: v_writelane_b32 v63, s99, 33
+; SI-NEXT: v_writelane_b32 v63, s30, 34
+; SI-NEXT: v_writelane_b32 v63, s31, 35
; SI-NEXT: s_lshr_b32 s5, s16, 16
; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v63, s87, 29
; SI-NEXT: s_lshr_b32 s6, s17, 16
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v62, s5, 0
-; SI-NEXT: v_writelane_b32 v63, s96, 30
; SI-NEXT: s_lshr_b32 s7, s18, 16
; SI-NEXT: v_writelane_b32 v62, s6, 1
-; SI-NEXT: v_writelane_b32 v63, s97, 31
; SI-NEXT: s_lshr_b32 s8, s19, 16
; SI-NEXT: v_writelane_b32 v62, s7, 2
-; SI-NEXT: v_writelane_b32 v63, s98, 32
; SI-NEXT: s_lshr_b32 s9, s20, 16
; SI-NEXT: v_writelane_b32 v62, s8, 3
-; SI-NEXT: v_writelane_b32 v63, s99, 33
; SI-NEXT: s_lshr_b32 s10, s21, 16
; SI-NEXT: v_writelane_b32 v62, s9, 4
-; SI-NEXT: v_writelane_b32 v63, s30, 34
; SI-NEXT: s_lshr_b32 s11, s22, 16
; SI-NEXT: v_writelane_b32 v62, s10, 5
-; SI-NEXT: v_writelane_b32 v63, s31, 35
; SI-NEXT: s_lshr_b32 s12, s23, 16
; SI-NEXT: v_readfirstlane_b32 s52, v17
; SI-NEXT: v_readfirstlane_b32 s48, v16
@@ -215105,20 +216094,6 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: v_writelane_b32 v62, s12, 7
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_writelane_b32 v62, s13, 8
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB103_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshl_b32 s4, s16, 16
@@ -215974,8 +216949,8 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; VI-NEXT: v_writelane_b32 v33, s50, 6
; VI-NEXT: v_writelane_b32 v33, s51, 7
; VI-NEXT: v_writelane_b32 v33, s30, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v33, s31, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s51, v17
; VI-NEXT: v_readfirstlane_b32 s50, v16
; VI-NEXT: v_readfirstlane_b32 s49, v15
@@ -216228,8 +217203,8 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -216346,36 +217321,36 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-NEXT: s_mov_b32 s37, s29
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: s_mov_b32 s36, s28
; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: s_mov_b32 s13, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB103_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -216468,8 +217443,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64bf16_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -216486,6 +217459,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v19
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v1
@@ -216603,7 +217578,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v14
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v12
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v4
; SI-NEXT: s_waitcnt expcnt(0)
@@ -216667,7 +217643,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: v_mul_f32_e32 v8, 1.0, v17
; SI-NEXT: ; kill: killed $vgpr15
; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v31
@@ -217594,8 +218569,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v64bf16_to_v64i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -217612,7 +218585,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -217861,6 +218836,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) #0 {
; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -218226,9 +219202,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v64bf16_to_v64i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -218245,7 +219218,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -218464,7 +219439,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) #0 {
; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -219820,14 +220795,28 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
; SI-NEXT: v_writelane_b32 v63, s37, 3
; SI-NEXT: v_writelane_b32 v63, s30, 4
-; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: v_writelane_b32 v63, s31, 5
+; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: s_and_b32 s14, s24, 0xffff0000
; SI-NEXT: s_lshl_b32 s15, s24, 16
; SI-NEXT: s_and_b32 s24, s23, 0xffff0000
@@ -219914,35 +220903,17 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s13
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 vcc_lo, 0
; SI-NEXT: v_mul_f32_e64 v42, 1.0, s16
; SI-NEXT: v_mul_f32_e64 v46, 1.0, s41
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v47, 1.0, s40
; SI-NEXT: v_mul_f32_e64 v38, 1.0, s18
; SI-NEXT: v_mul_f32_e64 v45, 1.0, s29
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_mul_f32_e64 v58, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v57, 1.0, s28
; SI-NEXT: v_mul_f32_e64 v43, 1.0, s20
; SI-NEXT: v_mul_f32_e64 v44, 1.0, s27
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mul_f32_e64 v61, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v59, 1.0, s26
; SI-NEXT: v_mul_f32_e64 v55, 1.0, s22
@@ -221035,8 +222006,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_writelane_b32 v35, s50, 6
; VI-NEXT: v_writelane_b32 v35, s51, 7
; VI-NEXT: v_writelane_b32 v35, s30, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v35, s31, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s51, v17
; VI-NEXT: v_readfirstlane_b32 s50, v16
; VI-NEXT: v_readfirstlane_b32 s49, v15
@@ -221720,8 +222691,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_writelane_b32 v36, s50, 6
; GFX9-NEXT: v_writelane_b32 v36, s51, 7
; GFX9-NEXT: v_writelane_b32 v36, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v36, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -222383,36 +223354,36 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s37, s29
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 3
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s36, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB105_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -222998,36 +223969,36 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s37, s29
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 3
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-FAKE16-NEXT: s_mov_b32 s36, s28
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 4
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 5
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 6
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 7
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB105_3
; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -224683,40 +225654,40 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: v_writelane_b32 v33, s80, 22
; SI-NEXT: v_writelane_b32 v33, s81, 23
; SI-NEXT: v_writelane_b32 v33, s82, 24
+; SI-NEXT: v_writelane_b32 v33, s83, 25
+; SI-NEXT: v_writelane_b32 v33, s84, 26
+; SI-NEXT: v_writelane_b32 v33, s85, 27
+; SI-NEXT: v_writelane_b32 v33, s86, 28
+; SI-NEXT: v_writelane_b32 v33, s87, 29
+; SI-NEXT: v_writelane_b32 v33, s96, 30
+; SI-NEXT: v_writelane_b32 v33, s97, 31
+; SI-NEXT: v_writelane_b32 v33, s98, 32
+; SI-NEXT: v_writelane_b32 v33, s99, 33
+; SI-NEXT: v_writelane_b32 v33, s30, 34
+; SI-NEXT: v_writelane_b32 v33, s31, 35
; SI-NEXT: s_lshr_b32 s5, s16, 16
; SI-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v33, s83, 25
; SI-NEXT: s_lshr_b32 s6, s17, 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v34, s5, 0
-; SI-NEXT: v_writelane_b32 v33, s84, 26
; SI-NEXT: s_lshr_b32 s8, s18, 16
; SI-NEXT: v_writelane_b32 v34, s6, 1
-; SI-NEXT: v_writelane_b32 v33, s85, 27
; SI-NEXT: s_lshr_b32 s90, s19, 16
; SI-NEXT: v_writelane_b32 v34, s8, 2
-; SI-NEXT: v_writelane_b32 v33, s86, 28
; SI-NEXT: s_lshr_b32 s91, s20, 16
; SI-NEXT: v_writelane_b32 v34, s90, 3
-; SI-NEXT: v_writelane_b32 v33, s87, 29
; SI-NEXT: s_lshr_b32 s92, s21, 16
; SI-NEXT: v_writelane_b32 v34, s91, 4
-; SI-NEXT: v_writelane_b32 v33, s96, 30
; SI-NEXT: s_lshr_b32 s93, s22, 16
; SI-NEXT: v_writelane_b32 v34, s92, 5
-; SI-NEXT: v_writelane_b32 v33, s97, 31
; SI-NEXT: s_lshr_b32 s94, s23, 16
; SI-NEXT: v_writelane_b32 v34, s93, 6
-; SI-NEXT: v_writelane_b32 v33, s98, 32
; SI-NEXT: s_lshr_b32 s95, s24, 16
; SI-NEXT: v_writelane_b32 v34, s94, 7
-; SI-NEXT: v_writelane_b32 v33, s99, 33
; SI-NEXT: s_lshr_b32 vcc_lo, s25, 16
; SI-NEXT: v_writelane_b32 v34, s95, 8
-; SI-NEXT: v_writelane_b32 v33, s30, 34
; SI-NEXT: s_lshr_b32 vcc_hi, s26, 16
; SI-NEXT: v_writelane_b32 v34, vcc_lo, 9
-; SI-NEXT: v_writelane_b32 v33, s31, 35
; SI-NEXT: s_lshr_b32 s30, s27, 16
; SI-NEXT: v_writelane_b32 v34, vcc_hi, 10
; SI-NEXT: s_lshr_b32 s31, s28, 16
@@ -225435,8 +226406,9 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; VI-NEXT: v_writelane_b32 v32, s37, 3
; VI-NEXT: v_writelane_b32 v32, s38, 4
; VI-NEXT: v_writelane_b32 v32, s39, 5
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s30, 6
+; VI-NEXT: v_writelane_b32 v32, s31, 7
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s8, v17
; VI-NEXT: v_readfirstlane_b32 s9, v16
; VI-NEXT: v_readfirstlane_b32 s10, v15
@@ -225456,7 +226428,6 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; VI-NEXT: v_readfirstlane_b32 s6, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s7, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB107_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB107_3
@@ -225685,8 +226656,8 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -225802,36 +226773,36 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-NEXT: s_mov_b32 s37, s29
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: s_mov_b32 s36, s28
; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: s_mov_b32 s13, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB107_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -225924,8 +226895,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64f16_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -225942,6 +226911,8 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v30
; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v29
; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v28
@@ -225981,8 +226952,9 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) #0 {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -226641,7 +227613,21 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s36, 2
@@ -226702,21 +227688,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s95, s35, 16
; SI-NEXT: v_readfirstlane_b32 s4, v18
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB109_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_cbranch_execnz .LBB109_4
@@ -227307,8 +228278,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v33, s50, 6
; VI-NEXT: v_writelane_b32 v33, s51, 7
; VI-NEXT: v_writelane_b32 v33, s30, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v33, s31, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s51, v17
; VI-NEXT: v_readfirstlane_b32 s50, v16
; VI-NEXT: v_readfirstlane_b32 s49, v15
@@ -227561,8 +228532,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -227679,36 +228650,36 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-NEXT: s_mov_b32 s37, s29
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: s_mov_b32 s36, s28
; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: s_mov_b32 s13, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB109_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -227801,6 +228772,22 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64i16_to_v64f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
@@ -227871,23 +228858,6 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38
; SI-NEXT: ; kill: killed $vgpr50
; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -229014,46 +229984,46 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s69, 19
; SI-NEXT: v_writelane_b32 v32, s70, 20
; SI-NEXT: v_writelane_b32 v32, s71, 21
+; SI-NEXT: v_writelane_b32 v32, s80, 22
+; SI-NEXT: v_writelane_b32 v32, s81, 23
+; SI-NEXT: v_writelane_b32 v32, s82, 24
+; SI-NEXT: v_writelane_b32 v32, s83, 25
+; SI-NEXT: v_writelane_b32 v32, s84, 26
+; SI-NEXT: v_writelane_b32 v32, s85, 27
+; SI-NEXT: v_writelane_b32 v32, s86, 28
+; SI-NEXT: v_writelane_b32 v32, s87, 29
+; SI-NEXT: v_writelane_b32 v32, s96, 30
+; SI-NEXT: v_writelane_b32 v32, s97, 31
+; SI-NEXT: v_writelane_b32 v32, s98, 32
+; SI-NEXT: v_writelane_b32 v32, s99, 33
+; SI-NEXT: v_writelane_b32 v32, s30, 34
+; SI-NEXT: v_writelane_b32 v32, s31, 35
; SI-NEXT: s_lshr_b32 s4, s27, 16
; SI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v32, s80, 22
+; SI-NEXT: s_mov_b32 s46, s16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v33, s4, 0
; SI-NEXT: s_lshr_b32 s4, s25, 16
-; SI-NEXT: v_writelane_b32 v32, s81, 23
; SI-NEXT: v_writelane_b32 v33, s4, 1
; SI-NEXT: s_lshr_b32 s4, s24, 16
-; SI-NEXT: v_writelane_b32 v32, s82, 24
; SI-NEXT: v_writelane_b32 v33, s4, 2
-; SI-NEXT: v_writelane_b32 v32, s83, 25
; SI-NEXT: v_writelane_b32 v33, s23, 3
; SI-NEXT: s_lshr_b32 s4, s23, 16
-; SI-NEXT: v_writelane_b32 v32, s84, 26
; SI-NEXT: v_writelane_b32 v33, s4, 4
; SI-NEXT: s_lshr_b32 s4, s21, 16
-; SI-NEXT: v_writelane_b32 v32, s85, 27
; SI-NEXT: v_writelane_b32 v33, s4, 5
-; SI-NEXT: v_writelane_b32 v32, s86, 28
; SI-NEXT: v_writelane_b32 v33, s19, 6
; SI-NEXT: s_lshr_b32 s4, s19, 16
-; SI-NEXT: v_writelane_b32 v32, s87, 29
; SI-NEXT: v_writelane_b32 v33, s4, 7
-; SI-NEXT: v_writelane_b32 v32, s96, 30
; SI-NEXT: v_writelane_b32 v33, s17, 8
; SI-NEXT: s_lshr_b32 s4, s17, 16
-; SI-NEXT: v_writelane_b32 v32, s97, 31
; SI-NEXT: v_writelane_b32 v33, s4, 9
; SI-NEXT: s_lshr_b32 s4, s16, 16
-; SI-NEXT: v_writelane_b32 v32, s98, 32
-; SI-NEXT: s_mov_b32 s46, s16
; SI-NEXT: v_writelane_b32 v33, s4, 10
-; SI-NEXT: v_writelane_b32 v32, s99, 33
; SI-NEXT: s_mov_b32 s47, s18
; SI-NEXT: v_writelane_b32 v33, s46, 11
-; SI-NEXT: v_writelane_b32 v32, s30, 34
; SI-NEXT: s_mov_b32 s57, s20
; SI-NEXT: v_writelane_b32 v33, s47, 12
-; SI-NEXT: v_writelane_b32 v32, s31, 35
; SI-NEXT: s_mov_b32 s31, s22
; SI-NEXT: v_writelane_b32 v33, s57, 13
; SI-NEXT: s_mov_b32 s35, s24
@@ -229799,8 +230769,9 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s37, 3
; VI-NEXT: v_writelane_b32 v32, s38, 4
; VI-NEXT: v_writelane_b32 v32, s39, 5
-; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_writelane_b32 v32, s30, 6
+; VI-NEXT: v_writelane_b32 v32, s31, 7
+; VI-NEXT: v_readfirstlane_b32 s4, v18
; VI-NEXT: v_readfirstlane_b32 s8, v17
; VI-NEXT: v_readfirstlane_b32 s9, v16
; VI-NEXT: v_readfirstlane_b32 s10, v15
@@ -229820,7 +230791,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s6, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s7, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB111_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB111_3
@@ -230049,8 +231019,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s30, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_writelane_b32 v32, s31, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v18
; GFX9-NEXT: v_readfirstlane_b32 s51, v17
; GFX9-NEXT: v_readfirstlane_b32 s50, v16
; GFX9-NEXT: v_readfirstlane_b32 s49, v15
@@ -230166,36 +231136,36 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v14
+; GFX11-NEXT: v_readfirstlane_b32 s51, v13
+; GFX11-NEXT: v_readfirstlane_b32 s50, v12
+; GFX11-NEXT: v_readfirstlane_b32 s49, v11
+; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: v_readfirstlane_b32 s47, v9
; GFX11-NEXT: v_readfirstlane_b32 s46, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s45, v7
; GFX11-NEXT: v_readfirstlane_b32 s44, v6
; GFX11-NEXT: v_readfirstlane_b32 s43, v5
; GFX11-NEXT: v_readfirstlane_b32 s42, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s41, v3
; GFX11-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: v_readfirstlane_b32 s38, v0
; GFX11-NEXT: s_mov_b32 s37, s29
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: v_readfirstlane_b32 s39, v1
; GFX11-NEXT: s_mov_b32 s36, s28
; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: v_readfirstlane_b32 s48, v10
; GFX11-NEXT: s_mov_b32 s13, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: v_readfirstlane_b32 s49, v11
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: v_readfirstlane_b32 s50, v12
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: v_readfirstlane_b32 s51, v13
; GFX11-NEXT: s_cbranch_scc0 .LBB111_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index b64daf30bb761..224b73353c3bc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -40851,6 +40851,18 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a,
; SI-LABEL: bitcast_v16bf16_to_v32i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_and_b32 s4, s23, 0xffff0000
; SI-NEXT: s_lshl_b32 s5, s23, 16
; SI-NEXT: s_and_b32 s6, s22, 0xffff0000
@@ -40867,18 +40879,6 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a,
; SI-NEXT: s_lshl_b32 s17, s17, 16
; SI-NEXT: s_and_b32 s19, s16, 0xffff0000
; SI-NEXT: s_lshl_b32 s16, s16, 16
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 s24, 0
; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: v_mul_f32_e64 v45, 1.0, s19
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 442767fc1162d..aba9755a4f357 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -15080,16 +15080,6 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v40i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
@@ -15103,6 +15093,16 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v20
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v24
@@ -15436,6 +15436,10 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v20i16_to_v40i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v10
; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
@@ -15447,10 +15451,6 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) #0 {
; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3
; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr15
@@ -18433,8 +18433,8 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_writelane_b32 v27, s38, 4
; SI-NEXT: v_writelane_b32 v27, s39, 5
; SI-NEXT: v_writelane_b32 v27, s30, 6
-; SI-NEXT: v_readfirstlane_b32 s4, v26
; SI-NEXT: v_writelane_b32 v27, s31, 7
+; SI-NEXT: v_readfirstlane_b32 s4, v26
; SI-NEXT: v_readfirstlane_b32 s90, v25
; SI-NEXT: v_readfirstlane_b32 s91, v24
; SI-NEXT: v_readfirstlane_b32 s93, v23
@@ -19778,6 +19778,8 @@ define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v10, s39, 3
; SI-NEXT: v_writelane_b32 v10, s48, 4
; SI-NEXT: v_writelane_b32 v10, s49, 5
+; SI-NEXT: v_writelane_b32 v10, s50, 6
+; SI-NEXT: v_writelane_b32 v10, s51, 7
; SI-NEXT: s_lshr_b32 s6, s25, 16
; SI-NEXT: s_lshr_b32 s7, s24, 16
; SI-NEXT: s_lshr_b32 s8, s23, 16
@@ -19788,9 +19790,7 @@ define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s13, s18, 16
; SI-NEXT: s_lshr_b32 s14, s17, 16
; SI-NEXT: s_lshr_b32 s15, s16, 16
-; SI-NEXT: v_writelane_b32 v10, s50, 6
; SI-NEXT: s_cmp_lg_u32 s26, 0
-; SI-NEXT: v_writelane_b32 v10, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB53_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -20697,6 +20697,8 @@ define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32
; SI-NEXT: v_writelane_b32 v10, s39, 3
; SI-NEXT: v_writelane_b32 v10, s48, 4
; SI-NEXT: v_writelane_b32 v10, s49, 5
+; SI-NEXT: v_writelane_b32 v10, s50, 6
+; SI-NEXT: v_writelane_b32 v10, s51, 7
; SI-NEXT: s_lshr_b32 s6, s25, 16
; SI-NEXT: s_lshr_b32 s7, s24, 16
; SI-NEXT: s_lshr_b32 s8, s23, 16
@@ -20707,9 +20709,7 @@ define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32
; SI-NEXT: s_lshr_b32 s13, s18, 16
; SI-NEXT: s_lshr_b32 s14, s17, 16
; SI-NEXT: s_lshr_b32 s15, s16, 16
-; SI-NEXT: v_writelane_b32 v10, s50, 6
; SI-NEXT: s_cmp_lg_u32 s26, 0
-; SI-NEXT: v_writelane_b32 v10, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB57_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -24775,8 +24775,8 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_writelane_b32 v27, s38, 4
; SI-NEXT: v_writelane_b32 v27, s39, 5
; SI-NEXT: v_writelane_b32 v27, s30, 6
-; SI-NEXT: v_readfirstlane_b32 s4, v26
; SI-NEXT: v_writelane_b32 v27, s31, 7
+; SI-NEXT: v_readfirstlane_b32 s4, v26
; SI-NEXT: v_readfirstlane_b32 s90, v25
; SI-NEXT: v_readfirstlane_b32 s91, v24
; SI-NEXT: v_readfirstlane_b32 s93, v23
@@ -26160,6 +26160,8 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_lshr_b32 s6, s25, 16
; SI-NEXT: s_lshr_b32 s7, s24, 16
; SI-NEXT: s_lshr_b32 s8, s23, 16
@@ -26170,9 +26172,7 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a,
; SI-NEXT: s_lshr_b32 s13, s18, 16
; SI-NEXT: s_lshr_b32 s14, s17, 16
; SI-NEXT: s_lshr_b32 s15, s16, 16
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s26, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB65_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -27166,6 +27166,8 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_lshr_b32 s6, s25, 16
; SI-NEXT: s_lshr_b32 s7, s24, 16
; SI-NEXT: s_lshr_b32 s8, s23, 16
@@ -27176,9 +27178,7 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32
; SI-NEXT: s_lshr_b32 s13, s18, 16
; SI-NEXT: s_lshr_b32 s14, s17, 16
; SI-NEXT: s_lshr_b32 s15, s16, 16
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s26, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB69_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -27892,6 +27892,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v5f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v36, v10
; SI-NEXT: v_mov_b32_e32 v35, v8
; SI-NEXT: v_mov_b32_e32 v34, v6
@@ -27908,15 +27917,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) #0 {
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: s_waitcnt expcnt(0)
@@ -27935,14 +27935,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -28005,7 +28008,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v8, 0xff, v50
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_or_b32_e32 v8, v8, v23
@@ -28145,7 +28148,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
@@ -29154,8 +29157,9 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_writelane_b32 v27, s39, 3
; SI-NEXT: v_writelane_b32 v27, s48, 4
; SI-NEXT: v_writelane_b32 v27, s49, 5
-; SI-NEXT: v_readfirstlane_b32 s4, v26
; SI-NEXT: v_writelane_b32 v27, s50, 6
+; SI-NEXT: v_writelane_b32 v27, s51, 7
+; SI-NEXT: v_readfirstlane_b32 s4, v26
; SI-NEXT: v_readfirstlane_b32 s6, v25
; SI-NEXT: v_readfirstlane_b32 s7, v24
; SI-NEXT: v_readfirstlane_b32 s8, v23
@@ -29183,7 +29187,6 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_readfirstlane_b32 s78, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s79, v0
-; SI-NEXT: v_writelane_b32 v27, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB73_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xff
@@ -31736,6 +31739,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v5i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v36, v10
; SI-NEXT: v_mov_b32_e32 v35, v8
; SI-NEXT: v_mov_b32_e32 v34, v6
@@ -31752,15 +31764,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) #0 {
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: s_waitcnt expcnt(0)
@@ -31779,14 +31782,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -31849,7 +31855,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v8, 0xff, v50
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_or_b32_e32 v8, v8, v23
@@ -31989,7 +31995,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) #0 {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
@@ -32998,8 +33004,9 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in
; SI-NEXT: v_writelane_b32 v27, s39, 3
; SI-NEXT: v_writelane_b32 v27, s48, 4
; SI-NEXT: v_writelane_b32 v27, s49, 5
-; SI-NEXT: v_readfirstlane_b32 s4, v26
; SI-NEXT: v_writelane_b32 v27, s50, 6
+; SI-NEXT: v_writelane_b32 v27, s51, 7
+; SI-NEXT: v_readfirstlane_b32 s4, v26
; SI-NEXT: v_readfirstlane_b32 s6, v25
; SI-NEXT: v_readfirstlane_b32 s7, v24
; SI-NEXT: v_readfirstlane_b32 s8, v23
@@ -33027,7 +33034,6 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in
; SI-NEXT: v_readfirstlane_b32 s78, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s79, v0
-; SI-NEXT: v_writelane_b32 v27, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB77_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xff
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
index cb0e72323a165..6d91f0ea85573 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
@@ -8254,6 +8254,8 @@ define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v12, s39, 3
; SI-NEXT: v_writelane_b32 v12, s48, 4
; SI-NEXT: v_writelane_b32 v12, s49, 5
+; SI-NEXT: v_writelane_b32 v12, s50, 6
+; SI-NEXT: v_writelane_b32 v12, s51, 7
; SI-NEXT: s_lshr_b32 s6, s27, 16
; SI-NEXT: s_lshr_b32 s7, s26, 16
; SI-NEXT: s_lshr_b32 s8, s25, 16
@@ -8266,9 +8268,7 @@ define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s15, s18, 16
; SI-NEXT: s_lshr_b32 s29, s17, 16
; SI-NEXT: s_lshr_b32 s56, s16, 16
-; SI-NEXT: v_writelane_b32 v12, s50, 6
; SI-NEXT: s_cmp_lg_u32 s28, 0
-; SI-NEXT: v_writelane_b32 v12, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB43_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -9325,6 +9325,8 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_lshr_b32 s6, s27, 16
; SI-NEXT: s_lshr_b32 s7, s26, 16
; SI-NEXT: s_lshr_b32 s8, s25, 16
@@ -9337,9 +9339,7 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a,
; SI-NEXT: s_lshr_b32 s15, s18, 16
; SI-NEXT: s_lshr_b32 s29, s17, 16
; SI-NEXT: s_lshr_b32 s56, s16, 16
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s28, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB47_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -10418,6 +10418,8 @@ define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32
; SI-NEXT: v_writelane_b32 v12, s39, 3
; SI-NEXT: v_writelane_b32 v12, s48, 4
; SI-NEXT: v_writelane_b32 v12, s49, 5
+; SI-NEXT: v_writelane_b32 v12, s50, 6
+; SI-NEXT: v_writelane_b32 v12, s51, 7
; SI-NEXT: s_lshr_b32 s6, s27, 16
; SI-NEXT: s_lshr_b32 s7, s26, 16
; SI-NEXT: s_lshr_b32 s8, s25, 16
@@ -10430,9 +10432,7 @@ define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32
; SI-NEXT: s_lshr_b32 s15, s18, 16
; SI-NEXT: s_lshr_b32 s29, s17, 16
; SI-NEXT: s_lshr_b32 s56, s16, 16
-; SI-NEXT: v_writelane_b32 v12, s50, 6
; SI-NEXT: s_cmp_lg_u32 s28, 0
-; SI-NEXT: v_writelane_b32 v12, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB51_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -11510,6 +11510,8 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_lshr_b32 s6, s27, 16
; SI-NEXT: s_lshr_b32 s7, s26, 16
; SI-NEXT: s_lshr_b32 s8, s25, 16
@@ -11522,9 +11524,7 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32
; SI-NEXT: s_lshr_b32 s15, s18, 16
; SI-NEXT: s_lshr_b32 s29, s17, 16
; SI-NEXT: s_lshr_b32 s56, s16, 16
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s28, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB55_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
index f7b7547cad1bd..04ccab7cc4b61 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
@@ -2606,6 +2606,8 @@ define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v14, s39, 3
; SI-NEXT: v_writelane_b32 v14, s48, 4
; SI-NEXT: v_writelane_b32 v14, s49, 5
+; SI-NEXT: v_writelane_b32 v14, s50, 6
+; SI-NEXT: v_writelane_b32 v14, s51, 7
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s7, s28, 16
; SI-NEXT: s_lshr_b32 s8, s27, 16
@@ -2621,9 +2623,7 @@ define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i3
; SI-NEXT: s_lshr_b32 s58, s17, 16
; SI-NEXT: s_lshr_b32 s59, s16, 16
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: v_writelane_b32 v14, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v14, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB15_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -3824,6 +3824,8 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s7, s28, 16
; SI-NEXT: s_lshr_b32 s8, s27, 16
@@ -3839,9 +3841,7 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s58, s17, 16
; SI-NEXT: s_lshr_b32 s59, s16, 16
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB19_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -6257,6 +6257,8 @@ define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a,
; SI-NEXT: v_writelane_b32 v14, s39, 3
; SI-NEXT: v_writelane_b32 v14, s48, 4
; SI-NEXT: v_writelane_b32 v14, s49, 5
+; SI-NEXT: v_writelane_b32 v14, s50, 6
+; SI-NEXT: v_writelane_b32 v14, s51, 7
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s7, s28, 16
; SI-NEXT: s_lshr_b32 s8, s27, 16
@@ -6272,9 +6274,7 @@ define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a,
; SI-NEXT: s_lshr_b32 s58, s17, 16
; SI-NEXT: s_lshr_b32 s59, s16, 16
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: v_writelane_b32 v14, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v14, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB31_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -7495,6 +7495,8 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s7, s28, 16
; SI-NEXT: s_lshr_b32 s8, s27, 16
@@ -7510,9 +7512,7 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a,
; SI-NEXT: s_lshr_b32 s58, s17, 16
; SI-NEXT: s_lshr_b32 s59, s16, 16
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB35_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -9285,6 +9285,8 @@ define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32
; SI-NEXT: v_writelane_b32 v14, s39, 3
; SI-NEXT: v_writelane_b32 v14, s48, 4
; SI-NEXT: v_writelane_b32 v14, s49, 5
+; SI-NEXT: v_writelane_b32 v14, s50, 6
+; SI-NEXT: v_writelane_b32 v14, s51, 7
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s7, s28, 16
; SI-NEXT: s_lshr_b32 s8, s27, 16
@@ -9300,9 +9302,7 @@ define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32
; SI-NEXT: s_lshr_b32 s58, s17, 16
; SI-NEXT: s_lshr_b32 s59, s16, 16
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: v_writelane_b32 v14, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v14, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB43_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -10507,6 +10507,8 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s7, s28, 16
; SI-NEXT: s_lshr_b32 s8, s27, 16
@@ -10522,9 +10524,7 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32
; SI-NEXT: s_lshr_b32 s58, s17, 16
; SI-NEXT: s_lshr_b32 s59, s16, 16
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB47_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -11691,6 +11691,8 @@ define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v14, s39, 3
; SI-NEXT: v_writelane_b32 v14, s48, 4
; SI-NEXT: v_writelane_b32 v14, s49, 5
+; SI-NEXT: v_writelane_b32 v14, s50, 6
+; SI-NEXT: v_writelane_b32 v14, s51, 7
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s7, s28, 16
; SI-NEXT: s_lshr_b32 s8, s27, 16
@@ -11706,9 +11708,7 @@ define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s58, s17, 16
; SI-NEXT: s_lshr_b32 s59, s16, 16
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: v_writelane_b32 v14, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v14, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB51_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -12880,6 +12880,8 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s7, s28, 16
; SI-NEXT: s_lshr_b32 s8, s27, 16
@@ -12895,9 +12897,7 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a,
; SI-NEXT: s_lshr_b32 s58, s17, 16
; SI-NEXT: s_lshr_b32 s59, s16, 16
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB55_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -13302,6 +13302,14 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i16_to_v28f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11
@@ -13316,14 +13324,6 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) #0 {
; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2
; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v26
; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v28
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index fd08154118f5c..beca22c49f5eb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -476,8 +476,8 @@ define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a,
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v16, s30, 0
-; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_writelane_b32 v16, s31, 1
+; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_readfirstlane_b32 s31, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s30, v0
@@ -537,8 +537,8 @@ define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a,
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -598,8 +598,8 @@ define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a,
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -1832,8 +1832,8 @@ define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v16, s30, 0
-; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_writelane_b32 v16, s31, 1
+; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_readfirstlane_b32 s31, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s30, v0
@@ -1885,8 +1885,8 @@ define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -1938,8 +1938,8 @@ define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -2915,9 +2915,11 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v16, s38, 2
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
+; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: v_readfirstlane_b32 s9, v1
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_writelane_b32 v16, s49, 5
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s8, s28, 16
; SI-NEXT: s_lshr_b32 s10, s27, 16
@@ -2935,9 +2937,7 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3
; SI-NEXT: s_lshr_b32 s7, s9, 16
; SI-NEXT: s_lshr_b32 s11, s12, 16
; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB15_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -3223,8 +3223,8 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -3871,6 +3871,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v15
; SI-NEXT: v_mov_b32_e32 v17, v14
; SI-NEXT: v_mov_b32_e32 v18, v13
@@ -3887,7 +3888,6 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) #0 {
; SI-NEXT: v_mov_b32_e32 v29, v2
; SI-NEXT: v_mov_b32_e32 v30, v1
; SI-NEXT: v_mov_b32_e32 v31, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18
@@ -4280,9 +4280,11 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v18, s38, 2
; SI-NEXT: v_writelane_b32 v18, s39, 3
; SI-NEXT: v_writelane_b32 v18, s48, 4
+; SI-NEXT: v_writelane_b32 v18, s49, 5
+; SI-NEXT: v_writelane_b32 v18, s50, 6
+; SI-NEXT: v_writelane_b32 v18, s51, 7
; SI-NEXT: v_readfirstlane_b32 s6, v1
; SI-NEXT: v_readfirstlane_b32 s8, v0
-; SI-NEXT: v_writelane_b32 v18, s49, 5
; SI-NEXT: s_lshr_b32 s9, s29, 16
; SI-NEXT: s_lshr_b32 s11, s28, 16
; SI-NEXT: s_lshr_b32 s12, s27, 16
@@ -4300,9 +4302,7 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s7, s6, 16
; SI-NEXT: s_lshr_b32 s10, s8, 16
; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_writelane_b32 v18, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v18, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB19_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -4526,8 +4526,8 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v17, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v17, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -4652,8 +4652,8 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -6835,6 +6835,22 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; SI-LABEL: bitcast_v32bf16_to_v16i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s42, v1
; SI-NEXT: v_readfirstlane_b32 s44, v0
; SI-NEXT: s_and_b32 s4, s29, 0xffff0000
@@ -6870,22 +6886,6 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; SI-NEXT: s_and_b32 s45, s44, 0xffff0000
; SI-NEXT: s_lshl_b32 s44, s44, 16
; SI-NEXT: v_readfirstlane_b32 s46, v2
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 s46, 0
; SI-NEXT: v_mul_f32_e64 v41, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v52, 1.0, s16
@@ -7084,8 +7084,8 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v20, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v20, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -7426,8 +7426,8 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v20, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -8481,8 +8481,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -8499,6 +8497,8 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr46
@@ -8900,10 +8900,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v16i32_to_v64i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -8920,6 +8916,10 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; implicit-def: $vgpr30
; VI-NEXT: ; implicit-def: $vgpr23
@@ -9216,10 +9216,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v16i32_to_v64i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; GFX9-NEXT: ; implicit-def: $vgpr17
-; GFX9-NEXT: ; kill: killed $vgpr17
-; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -9236,6 +9232,10 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: ; implicit-def: $vgpr31
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: ; implicit-def: $vgpr24
@@ -9980,12 +9980,12 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; SI-NEXT: v_writelane_b32 v4, s83, 25
; SI-NEXT: v_writelane_b32 v4, s84, 26
; SI-NEXT: v_writelane_b32 v4, s85, 27
-; SI-NEXT: v_readfirstlane_b32 s4, v3
; SI-NEXT: v_writelane_b32 v4, s30, 28
+; SI-NEXT: v_writelane_b32 v4, s31, 29
+; SI-NEXT: v_readfirstlane_b32 s4, v3
; SI-NEXT: v_readfirstlane_b32 s5, v2
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v1
-; SI-NEXT: v_writelane_b32 v4, s31, 29
; SI-NEXT: s_cbranch_scc0 .LBB25_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 24
@@ -10417,12 +10417,12 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; VI-NEXT: v_writelane_b32 v4, s65, 15
; VI-NEXT: v_writelane_b32 v4, s66, 16
; VI-NEXT: v_writelane_b32 v4, s67, 17
-; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_writelane_b32 v4, s30, 18
+; VI-NEXT: v_writelane_b32 v4, s31, 19
+; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: v_writelane_b32 v4, s31, 19
; VI-NEXT: s_cbranch_scc0 .LBB25_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s56, s5, 24
@@ -10764,12 +10764,12 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX9-NEXT: v_writelane_b32 v4, s53, 11
; GFX9-NEXT: v_writelane_b32 v4, s54, 12
; GFX9-NEXT: v_writelane_b32 v4, s55, 13
-; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_writelane_b32 v4, s30, 14
+; GFX9-NEXT: v_writelane_b32 v4, s31, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: v_writelane_b32 v4, s31, 15
; GFX9-NEXT: s_cbranch_scc0 .LBB25_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
@@ -11079,8 +11079,6 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX11-NEXT: scratch_store_b32 off, v23, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v23, s34, 0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: v_writelane_b32 v23, s35, 1
; GFX11-NEXT: v_writelane_b32 v23, s36, 2
; GFX11-NEXT: v_writelane_b32 v23, s37, 3
@@ -11089,6 +11087,8 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX11-NEXT: v_writelane_b32 v23, s48, 6
; GFX11-NEXT: v_writelane_b32 v23, s30, 7
; GFX11-NEXT: v_writelane_b32 v23, s31, 8
+; GFX11-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB25_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s27, 24
@@ -15407,8 +15407,8 @@ define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i3
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v16, s30, 0
-; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_writelane_b32 v16, s31, 1
+; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_readfirstlane_b32 s31, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s30, v0
@@ -15468,8 +15468,8 @@ define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i3
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -15529,8 +15529,8 @@ define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -16117,8 +16117,8 @@ define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a,
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v16, s30, 0
-; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_writelane_b32 v16, s31, 1
+; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_readfirstlane_b32 s31, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s30, v0
@@ -16178,8 +16178,8 @@ define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a,
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -16239,8 +16239,8 @@ define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a,
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -16464,8 +16464,8 @@ define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a,
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v16, s30, 0
-; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_writelane_b32 v16, s31, 1
+; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_readfirstlane_b32 s31, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s30, v0
@@ -16517,8 +16517,8 @@ define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a,
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -16570,8 +16570,8 @@ define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a,
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -17063,8 +17063,8 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a,
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -17124,8 +17124,8 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a,
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -17587,9 +17587,11 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a,
; SI-NEXT: v_writelane_b32 v16, s38, 2
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
+; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: v_readfirstlane_b32 s9, v1
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_writelane_b32 v16, s49, 5
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s8, s28, 16
; SI-NEXT: s_lshr_b32 s10, s27, 16
@@ -17607,9 +17609,7 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a,
; SI-NEXT: s_lshr_b32 s7, s9, 16
; SI-NEXT: s_lshr_b32 s11, s12, 16
; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB39_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -17895,8 +17895,8 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a,
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -18404,8 +18404,8 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a,
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -18465,8 +18465,8 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a,
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -18583,6 +18583,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v15
; SI-NEXT: v_mov_b32_e32 v17, v14
; SI-NEXT: v_mov_b32_e32 v18, v13
@@ -18599,7 +18600,6 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) #0 {
; SI-NEXT: v_mov_b32_e32 v29, v2
; SI-NEXT: v_mov_b32_e32 v30, v1
; SI-NEXT: v_mov_b32_e32 v31, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18
@@ -18992,9 +18992,11 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v18, s38, 2
; SI-NEXT: v_writelane_b32 v18, s39, 3
; SI-NEXT: v_writelane_b32 v18, s48, 4
+; SI-NEXT: v_writelane_b32 v18, s49, 5
+; SI-NEXT: v_writelane_b32 v18, s50, 6
+; SI-NEXT: v_writelane_b32 v18, s51, 7
; SI-NEXT: v_readfirstlane_b32 s6, v1
; SI-NEXT: v_readfirstlane_b32 s8, v0
-; SI-NEXT: v_writelane_b32 v18, s49, 5
; SI-NEXT: s_lshr_b32 s9, s29, 16
; SI-NEXT: s_lshr_b32 s11, s28, 16
; SI-NEXT: s_lshr_b32 s12, s27, 16
@@ -19012,9 +19014,7 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a,
; SI-NEXT: s_lshr_b32 s7, s6, 16
; SI-NEXT: s_lshr_b32 s10, s8, 16
; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_writelane_b32 v18, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v18, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB43_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -19238,8 +19238,8 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a,
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v17, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v17, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -19364,8 +19364,8 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a,
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -20018,8 +20018,8 @@ define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -20079,8 +20079,8 @@ define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -21603,6 +21603,22 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; SI-LABEL: bitcast_v32bf16_to_v16f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s42, v1
; SI-NEXT: v_readfirstlane_b32 s44, v0
; SI-NEXT: s_and_b32 s4, s29, 0xffff0000
@@ -21638,22 +21654,6 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; SI-NEXT: s_and_b32 s45, s44, 0xffff0000
; SI-NEXT: s_lshl_b32 s44, s44, 16
; SI-NEXT: v_readfirstlane_b32 s46, v2
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 s46, 0
; SI-NEXT: v_mul_f32_e64 v41, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v52, 1.0, s16
@@ -21852,8 +21852,8 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v20, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v20, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -22194,8 +22194,8 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v20, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -23249,8 +23249,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -23267,6 +23265,8 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr46
@@ -23668,10 +23668,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v16f32_to_v64i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -23688,6 +23684,10 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; implicit-def: $vgpr30
; VI-NEXT: ; implicit-def: $vgpr23
@@ -23984,10 +23984,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v16f32_to_v64i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; GFX9-NEXT: ; implicit-def: $vgpr17
-; GFX9-NEXT: ; kill: killed $vgpr17
-; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -24004,6 +24000,10 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: ; implicit-def: $vgpr31
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: ; implicit-def: $vgpr24
@@ -24732,12 +24732,12 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_writelane_b32 v40, s83, 25
; SI-NEXT: v_writelane_b32 v40, s84, 26
; SI-NEXT: v_writelane_b32 v40, s85, 27
-; SI-NEXT: v_readfirstlane_b32 s4, v3
; SI-NEXT: v_writelane_b32 v40, s30, 28
+; SI-NEXT: v_writelane_b32 v40, s31, 29
+; SI-NEXT: v_readfirstlane_b32 s4, v3
; SI-NEXT: v_readfirstlane_b32 s37, v2
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s36, v1
-; SI-NEXT: v_writelane_b32 v40, s31, 29
; SI-NEXT: s_cbranch_scc0 .LBB49_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s82, s37, 24
@@ -25195,6 +25195,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v63, s34, 0
; VI-NEXT: v_writelane_b32 v63, s35, 1
; VI-NEXT: v_writelane_b32 v63, s36, 2
@@ -25214,26 +25229,11 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; VI-NEXT: v_writelane_b32 v63, s66, 16
; VI-NEXT: v_writelane_b32 v63, s67, 17
; VI-NEXT: v_writelane_b32 v63, s30, 18
-; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_writelane_b32 v63, s31, 19
+; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB49_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s56, s5, 24
@@ -25624,6 +25624,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s34, 0
; GFX9-NEXT: v_writelane_b32 v63, s35, 1
; GFX9-NEXT: v_writelane_b32 v63, s36, 2
@@ -25639,26 +25654,11 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v63, s54, 12
; GFX9-NEXT: v_writelane_b32 v63, s55, 13
; GFX9-NEXT: v_writelane_b32 v63, s30, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_writelane_b32 v63, s31, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB49_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
@@ -26037,8 +26037,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v40, s34, 0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s42, 0
; GFX11-NEXT: v_writelane_b32 v40, s35, 1
; GFX11-NEXT: v_writelane_b32 v40, s36, 2
; GFX11-NEXT: v_writelane_b32 v40, s37, 3
@@ -26048,6 +26046,8 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX11-NEXT: v_writelane_b32 v40, s49, 7
; GFX11-NEXT: v_writelane_b32 v40, s30, 8
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
+; GFX11-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-NEXT: s_mov_b32 s42, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB49_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s43, s27, 24
@@ -30714,8 +30714,8 @@ define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v16, s30, 0
-; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_writelane_b32 v16, s31, 1
+; SI-NEXT: v_readfirstlane_b32 s4, v2
; SI-NEXT: v_readfirstlane_b32 s31, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s30, v0
@@ -30767,8 +30767,8 @@ define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -30820,8 +30820,8 @@ define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -31801,9 +31801,11 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: v_writelane_b32 v16, s38, 2
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
+; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: v_readfirstlane_b32 s9, v1
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_writelane_b32 v16, s49, 5
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s8, s28, 16
; SI-NEXT: s_lshr_b32 s10, s27, 16
@@ -31821,9 +31823,7 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: s_lshr_b32 s7, s9, 16
; SI-NEXT: s_lshr_b32 s11, s12, 16
; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB59_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -32109,8 +32109,8 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -32761,6 +32761,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v8i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v15
; SI-NEXT: v_mov_b32_e32 v17, v14
; SI-NEXT: v_mov_b32_e32 v18, v13
@@ -32777,7 +32778,6 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) #0 {
; SI-NEXT: v_mov_b32_e32 v29, v2
; SI-NEXT: v_mov_b32_e32 v30, v1
; SI-NEXT: v_mov_b32_e32 v31, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18
@@ -33170,9 +33170,11 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32
; SI-NEXT: v_writelane_b32 v18, s38, 2
; SI-NEXT: v_writelane_b32 v18, s39, 3
; SI-NEXT: v_writelane_b32 v18, s48, 4
+; SI-NEXT: v_writelane_b32 v18, s49, 5
+; SI-NEXT: v_writelane_b32 v18, s50, 6
+; SI-NEXT: v_writelane_b32 v18, s51, 7
; SI-NEXT: v_readfirstlane_b32 s6, v1
; SI-NEXT: v_readfirstlane_b32 s8, v0
-; SI-NEXT: v_writelane_b32 v18, s49, 5
; SI-NEXT: s_lshr_b32 s9, s29, 16
; SI-NEXT: s_lshr_b32 s11, s28, 16
; SI-NEXT: s_lshr_b32 s12, s27, 16
@@ -33190,9 +33192,7 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32
; SI-NEXT: s_lshr_b32 s7, s6, 16
; SI-NEXT: s_lshr_b32 s10, s8, 16
; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_writelane_b32 v18, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v18, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB63_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -33416,8 +33416,8 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v17, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v17, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -33542,8 +33542,8 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -35729,6 +35729,22 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; SI-LABEL: bitcast_v32bf16_to_v8i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s42, v1
; SI-NEXT: v_readfirstlane_b32 s44, v0
; SI-NEXT: s_and_b32 s4, s29, 0xffff0000
@@ -35764,22 +35780,6 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: s_and_b32 s45, s44, 0xffff0000
; SI-NEXT: s_lshl_b32 s44, s44, 16
; SI-NEXT: v_readfirstlane_b32 s46, v2
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 s46, 0
; SI-NEXT: v_mul_f32_e64 v41, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v52, 1.0, s16
@@ -35978,8 +35978,8 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v20, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v20, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -36320,8 +36320,8 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v20, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -37375,8 +37375,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -37393,6 +37391,8 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr46
@@ -37794,10 +37794,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v8i64_to_v64i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -37814,6 +37810,10 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; implicit-def: $vgpr30
; VI-NEXT: ; implicit-def: $vgpr23
@@ -38110,10 +38110,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v8i64_to_v64i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; GFX9-NEXT: ; implicit-def: $vgpr17
-; GFX9-NEXT: ; kill: killed $vgpr17
-; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -38130,6 +38126,10 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: ; implicit-def: $vgpr31
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: ; implicit-def: $vgpr24
@@ -38884,12 +38884,12 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; SI-NEXT: v_writelane_b32 v4, s83, 25
; SI-NEXT: v_writelane_b32 v4, s84, 26
; SI-NEXT: v_writelane_b32 v4, s85, 27
-; SI-NEXT: v_readfirstlane_b32 s4, v3
; SI-NEXT: v_writelane_b32 v4, s30, 28
+; SI-NEXT: v_writelane_b32 v4, s31, 29
+; SI-NEXT: v_readfirstlane_b32 s4, v3
; SI-NEXT: v_readfirstlane_b32 s5, v2
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v1
-; SI-NEXT: v_writelane_b32 v4, s31, 29
; SI-NEXT: s_cbranch_scc0 .LBB69_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 24
@@ -39321,12 +39321,12 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; VI-NEXT: v_writelane_b32 v4, s65, 15
; VI-NEXT: v_writelane_b32 v4, s66, 16
; VI-NEXT: v_writelane_b32 v4, s67, 17
-; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_writelane_b32 v4, s30, 18
+; VI-NEXT: v_writelane_b32 v4, s31, 19
+; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: v_writelane_b32 v4, s31, 19
; VI-NEXT: s_cbranch_scc0 .LBB69_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s56, s5, 24
@@ -39668,12 +39668,12 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX9-NEXT: v_writelane_b32 v4, s53, 11
; GFX9-NEXT: v_writelane_b32 v4, s54, 12
; GFX9-NEXT: v_writelane_b32 v4, s55, 13
-; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_writelane_b32 v4, s30, 14
+; GFX9-NEXT: v_writelane_b32 v4, s31, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: v_writelane_b32 v4, s31, 15
; GFX9-NEXT: s_cbranch_scc0 .LBB69_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
@@ -39983,8 +39983,6 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX11-NEXT: scratch_store_b32 off, v23, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v23, s34, 0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: v_writelane_b32 v23, s35, 1
; GFX11-NEXT: v_writelane_b32 v23, s36, 2
; GFX11-NEXT: v_writelane_b32 v23, s37, 3
@@ -39993,6 +39991,8 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX11-NEXT: v_writelane_b32 v23, s48, 6
; GFX11-NEXT: v_writelane_b32 v23, s30, 7
; GFX11-NEXT: v_writelane_b32 v23, s31, 8
+; GFX11-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB69_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s27, 24
@@ -44539,8 +44539,8 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -44592,8 +44592,8 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -45039,9 +45039,11 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v16, s38, 2
; SI-NEXT: v_writelane_b32 v16, s39, 3
; SI-NEXT: v_writelane_b32 v16, s48, 4
+; SI-NEXT: v_writelane_b32 v16, s49, 5
+; SI-NEXT: v_writelane_b32 v16, s50, 6
+; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: v_readfirstlane_b32 s9, v1
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_writelane_b32 v16, s49, 5
; SI-NEXT: s_lshr_b32 s6, s29, 16
; SI-NEXT: s_lshr_b32 s8, s28, 16
; SI-NEXT: s_lshr_b32 s10, s27, 16
@@ -45059,9 +45061,7 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s7, s9, 16
; SI-NEXT: s_lshr_b32 s11, s12, 16
; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_writelane_b32 v16, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v16, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB75_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -45347,8 +45347,8 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -45824,8 +45824,8 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a,
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -45877,8 +45877,8 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a,
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -45979,6 +45979,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v15
; SI-NEXT: v_mov_b32_e32 v17, v14
; SI-NEXT: v_mov_b32_e32 v18, v13
@@ -45995,7 +45996,6 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) #0 {
; SI-NEXT: v_mov_b32_e32 v29, v2
; SI-NEXT: v_mov_b32_e32 v30, v1
; SI-NEXT: v_mov_b32_e32 v31, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18
@@ -46388,9 +46388,11 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v18, s38, 2
; SI-NEXT: v_writelane_b32 v18, s39, 3
; SI-NEXT: v_writelane_b32 v18, s48, 4
+; SI-NEXT: v_writelane_b32 v18, s49, 5
+; SI-NEXT: v_writelane_b32 v18, s50, 6
+; SI-NEXT: v_writelane_b32 v18, s51, 7
; SI-NEXT: v_readfirstlane_b32 s6, v1
; SI-NEXT: v_readfirstlane_b32 s8, v0
-; SI-NEXT: v_writelane_b32 v18, s49, 5
; SI-NEXT: s_lshr_b32 s9, s29, 16
; SI-NEXT: s_lshr_b32 s11, s28, 16
; SI-NEXT: s_lshr_b32 s12, s27, 16
@@ -46408,9 +46410,7 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a,
; SI-NEXT: s_lshr_b32 s7, s6, 16
; SI-NEXT: s_lshr_b32 s10, s8, 16
; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_writelane_b32 v18, s50, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v18, s51, 7
; SI-NEXT: s_cbranch_scc0 .LBB79_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -46634,8 +46634,8 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a,
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v17, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v17, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -46760,8 +46760,8 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a,
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -47374,8 +47374,8 @@ define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v16, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v16, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -47427,8 +47427,8 @@ define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -48935,6 +48935,22 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; SI-LABEL: bitcast_v32bf16_to_v8f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s42, v1
; SI-NEXT: v_readfirstlane_b32 s44, v0
; SI-NEXT: s_and_b32 s4, s29, 0xffff0000
@@ -48970,22 +48986,6 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; SI-NEXT: s_and_b32 s45, s44, 0xffff0000
; SI-NEXT: s_lshl_b32 s44, s44, 16
; SI-NEXT: v_readfirstlane_b32 s46, v2
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 s46, 0
; SI-NEXT: v_mul_f32_e64 v41, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v52, 1.0, s16
@@ -49184,8 +49184,8 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v20, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v20, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -49526,8 +49526,8 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v20, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -50581,8 +50581,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -50599,6 +50597,8 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr47
@@ -50992,10 +50992,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v8f64_to_v64i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -51012,6 +51008,10 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; implicit-def: $vgpr30
; VI-NEXT: ; implicit-def: $vgpr23
@@ -51300,10 +51300,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v8f64_to_v64i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; GFX9-NEXT: ; implicit-def: $vgpr17
-; GFX9-NEXT: ; kill: killed $vgpr17
-; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -51320,6 +51316,10 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: ; implicit-def: $vgpr31
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: ; implicit-def: $vgpr24
@@ -52042,12 +52042,12 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_writelane_b32 v40, s85, 27
; SI-NEXT: v_writelane_b32 v40, s86, 28
; SI-NEXT: v_writelane_b32 v40, s87, 29
-; SI-NEXT: v_readfirstlane_b32 s4, v3
; SI-NEXT: v_writelane_b32 v40, s30, 30
+; SI-NEXT: v_writelane_b32 v40, s31, 31
+; SI-NEXT: v_readfirstlane_b32 s4, v3
; SI-NEXT: v_readfirstlane_b32 s5, v2
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v1
-; SI-NEXT: v_writelane_b32 v40, s31, 31
; SI-NEXT: s_cbranch_scc0 .LBB85_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s48, s5, 24
@@ -52491,6 +52491,21 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v63, s34, 0
; VI-NEXT: v_writelane_b32 v63, s35, 1
; VI-NEXT: v_writelane_b32 v63, s36, 2
@@ -52510,26 +52525,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; VI-NEXT: v_writelane_b32 v63, s66, 16
; VI-NEXT: v_writelane_b32 v63, s67, 17
; VI-NEXT: v_writelane_b32 v63, s30, 18
-; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_writelane_b32 v63, s31, 19
+; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB85_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s35, s5, 24
@@ -52920,6 +52920,21 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s34, 0
; GFX9-NEXT: v_writelane_b32 v63, s35, 1
; GFX9-NEXT: v_writelane_b32 v63, s36, 2
@@ -52935,26 +52950,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX9-NEXT: v_writelane_b32 v63, s54, 12
; GFX9-NEXT: v_writelane_b32 v63, s55, 13
; GFX9-NEXT: v_writelane_b32 v63, s30, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_writelane_b32 v63, s31, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB85_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s95, s5, 24
@@ -53334,8 +53334,6 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v40, s34, 0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s42, 0
; GFX11-NEXT: v_writelane_b32 v40, s35, 1
; GFX11-NEXT: v_writelane_b32 v40, s36, 2
; GFX11-NEXT: v_writelane_b32 v40, s37, 3
@@ -53345,6 +53343,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX11-NEXT: v_writelane_b32 v40, s49, 7
; GFX11-NEXT: v_writelane_b32 v40, s30, 8
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
+; GFX11-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-NEXT: s_mov_b32 s42, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB85_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s90, s27, 24
@@ -57558,22 +57558,6 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v32f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -57590,6 +57574,22 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v31
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v32
@@ -58017,9 +58017,9 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v16, s38, 4
; SI-NEXT: v_writelane_b32 v16, s39, 5
; SI-NEXT: v_writelane_b32 v16, s30, 6
+; SI-NEXT: v_writelane_b32 v16, s31, 7
; SI-NEXT: v_readfirstlane_b32 s37, v1
; SI-NEXT: v_readfirstlane_b32 s39, v0
-; SI-NEXT: v_writelane_b32 v16, s31, 7
; SI-NEXT: s_lshr_b32 s92, s29, 16
; SI-NEXT: s_lshr_b32 s36, s28, 16
; SI-NEXT: s_lshr_b32 s91, s27, 16
@@ -58418,8 +58418,8 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -59142,8 +59142,8 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v18, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v18, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -59268,8 +59268,8 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -59793,9 +59793,9 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
; SI-NEXT: v_writelane_b32 v17, s38, 4
; SI-NEXT: v_writelane_b32 v17, s39, 5
; SI-NEXT: v_writelane_b32 v17, s30, 6
+; SI-NEXT: v_writelane_b32 v17, s31, 7
; SI-NEXT: v_readfirstlane_b32 s39, v1
; SI-NEXT: v_readfirstlane_b32 s37, v0
-; SI-NEXT: v_writelane_b32 v17, s31, 7
; SI-NEXT: s_lshr_b32 s35, s29, 16
; SI-NEXT: s_lshr_b32 s34, s28, 16
; SI-NEXT: s_lshr_b32 s31, s27, 16
@@ -60194,8 +60194,8 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -60312,6 +60312,22 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v32i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
@@ -60344,22 +60360,6 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0
@@ -61786,6 +61786,22 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; SI-LABEL: bitcast_v32bf16_to_v32i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s42, v1
; SI-NEXT: v_readfirstlane_b32 s44, v0
; SI-NEXT: s_and_b32 s4, s29, 0xffff0000
@@ -61821,22 +61837,6 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; SI-NEXT: s_and_b32 s45, s44, 0xffff0000
; SI-NEXT: s_lshl_b32 s44, s44, 16
; SI-NEXT: v_readfirstlane_b32 s46, v2
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 s46, 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v63, 1.0, s16
@@ -62128,8 +62128,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v20, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v20, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -62469,8 +62469,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v20, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -63432,6 +63432,22 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; kill: killed $vgpr17
@@ -63490,22 +63506,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; kill: killed $vgpr17
; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14
@@ -64209,8 +64209,24 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) #0 {
;
; VI-LABEL: bitcast_v32i16_to_v64i8:
; VI: ; %bb.0:
-; VI-NEXT: ; implicit-def: $vgpr19
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr19
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; kill: killed $vgpr19
@@ -64263,22 +64279,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) #0 {
; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr57
; VI-NEXT: ; implicit-def: $vgpr53
@@ -64664,10 +64664,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v32i16_to_v64i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; GFX9-NEXT: ; implicit-def: $vgpr17
-; GFX9-NEXT: ; kill: killed $vgpr17
-; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -64684,6 +64680,10 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: ; implicit-def: $vgpr31
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: ; implicit-def: $vgpr24
@@ -65434,9 +65434,11 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: v_writelane_b32 v4, s96, 30
; SI-NEXT: v_writelane_b32 v4, s97, 31
; SI-NEXT: v_writelane_b32 v4, s98, 32
+; SI-NEXT: v_writelane_b32 v4, s99, 33
+; SI-NEXT: v_writelane_b32 v4, s30, 34
+; SI-NEXT: v_writelane_b32 v4, s31, 35
; SI-NEXT: v_readfirstlane_b32 s56, v2
; SI-NEXT: v_readfirstlane_b32 s60, v1
-; SI-NEXT: v_writelane_b32 v4, s99, 33
; SI-NEXT: s_lshr_b32 s68, s29, 16
; SI-NEXT: s_lshr_b32 s46, s28, 16
; SI-NEXT: s_lshr_b32 s70, s27, 16
@@ -65454,9 +65456,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: s_lshr_b32 s69, s56, 16
; SI-NEXT: s_lshr_b32 s58, s60, 16
; SI-NEXT: v_readfirstlane_b32 s4, v3
-; SI-NEXT: v_writelane_b32 v4, s30, 34
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v4, s31, 35
; SI-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB97_4
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -66043,12 +66043,12 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; VI-NEXT: v_writelane_b32 v4, s65, 15
; VI-NEXT: v_writelane_b32 v4, s66, 16
; VI-NEXT: v_writelane_b32 v4, s67, 17
-; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_writelane_b32 v4, s30, 18
+; VI-NEXT: v_writelane_b32 v4, s31, 19
+; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: v_writelane_b32 v4, s31, 19
; VI-NEXT: s_cbranch_scc0 .LBB97_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s56, s5, 24
@@ -66440,6 +66440,21 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s34, 0
; GFX9-NEXT: v_writelane_b32 v63, s35, 1
; GFX9-NEXT: v_writelane_b32 v63, s36, 2
@@ -66455,26 +66470,11 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX9-NEXT: v_writelane_b32 v63, s54, 12
; GFX9-NEXT: v_writelane_b32 v63, s55, 13
; GFX9-NEXT: v_writelane_b32 v63, s30, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_writelane_b32 v63, s31, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB97_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
@@ -66853,8 +66853,6 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v40, s34, 0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s42, 0
; GFX11-NEXT: v_writelane_b32 v40, s35, 1
; GFX11-NEXT: v_writelane_b32 v40, s36, 2
; GFX11-NEXT: v_writelane_b32 v40, s37, 3
@@ -66864,6 +66862,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX11-NEXT: v_writelane_b32 v40, s49, 7
; GFX11-NEXT: v_writelane_b32 v40, s30, 8
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
+; GFX11-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-NEXT: s_mov_b32 s42, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB97_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s43, s27, 24
@@ -69691,9 +69691,9 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_writelane_b32 v40, s98, 32
; SI-NEXT: v_writelane_b32 v40, s99, 33
; SI-NEXT: v_writelane_b32 v40, s30, 34
+; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: v_readfirstlane_b32 s36, v28
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76
-; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: v_readfirstlane_b32 s89, v30
; SI-NEXT: v_readfirstlane_b32 s90, v29
; SI-NEXT: v_readfirstlane_b32 s88, v27
@@ -69769,14 +69769,15 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_readfirstlane_b32 s87, v33
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s80, v34
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_readfirstlane_b32 s34, v35
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s57, v37
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s47, v38
; SI-NEXT: v_writelane_b32 v41, s47, 5
-; SI-NEXT: v_readfirstlane_b32 s34, v35
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s98, v36
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_readfirstlane_b32 s20, v39
@@ -72141,9 +72142,11 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
; SI-NEXT: v_writelane_b32 v40, s36, 2
; SI-NEXT: v_writelane_b32 v40, s37, 3
; SI-NEXT: v_writelane_b32 v40, s38, 4
+; SI-NEXT: v_writelane_b32 v40, s39, 5
+; SI-NEXT: v_writelane_b32 v40, s30, 6
+; SI-NEXT: v_writelane_b32 v40, s31, 7
; SI-NEXT: v_readfirstlane_b32 s60, v1
; SI-NEXT: v_readfirstlane_b32 s56, v0
-; SI-NEXT: v_writelane_b32 v40, s39, 5
; SI-NEXT: s_lshr_b32 s43, s29, 16
; SI-NEXT: s_lshr_b32 s42, s28, 16
; SI-NEXT: s_lshr_b32 s41, s27, 16
@@ -72161,9 +72164,7 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
; SI-NEXT: s_lshr_b32 s63, s60, 16
; SI-NEXT: s_lshr_b32 s59, s56, 16
; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_writelane_b32 v40, s30, 6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v40, s31, 7
; SI-NEXT: s_cbranch_scc0 .LBB101_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshl_b32 s44, s16, 16
@@ -72482,8 +72483,8 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v18, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v18, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -72608,8 +72609,8 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v16, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v16, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -72727,6 +72728,22 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v32f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
@@ -72759,22 +72776,6 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31
; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1
@@ -74240,6 +74241,22 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; SI-LABEL: bitcast_v32bf16_to_v32f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_and_b32 s40, s17, 0xffff0000
; SI-NEXT: s_and_b32 s10, s26, 0xffff0000
; SI-NEXT: s_lshl_b32 s11, s26, 16
@@ -74285,28 +74302,11 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s45
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cmp_lg_u32 s46, 0
; SI-NEXT: v_mul_f32_e64 v43, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v40, 1.0, s29
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s28
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v47, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v17, 1.0, s25
@@ -74321,7 +74321,6 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; SI-NEXT: v_mul_f32_e64 v48, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v38, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v36, 1.0, s13
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mul_f32_e64 v62, 1.0, s9
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s5
; SI-NEXT: s_waitcnt expcnt(0)
@@ -74809,8 +74808,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v20, s30, 0
-; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_writelane_b32 v20, s31, 1
+; VI-NEXT: v_readfirstlane_b32 s4, v2
; VI-NEXT: v_readfirstlane_b32 s31, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s30, v0
@@ -75150,8 +75149,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v20, s30, 0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_writelane_b32 v20, s31, 1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s31, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s30, v0
@@ -76196,6 +76195,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; kill: killed $vgpr17
@@ -76239,22 +76254,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; kill: killed $vgpr25
; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16
; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14
@@ -76963,6 +76962,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v32f16_to_v64i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16
@@ -76983,22 +76998,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) #0 {
; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1
; VI-NEXT: ; kill: killed $vgpr17
; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr26
; VI-NEXT: ; implicit-def: $vgpr22
; VI-NEXT: ; implicit-def: $vgpr63
@@ -77295,10 +77294,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v32f16_to_v64i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; GFX9-NEXT: ; implicit-def: $vgpr17
-; GFX9-NEXT: ; kill: killed $vgpr17
-; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -77315,6 +77310,10 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: ; implicit-def: $vgpr31
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: ; implicit-def: $vgpr24
@@ -78066,9 +78065,11 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; SI-NEXT: v_writelane_b32 v18, s96, 30
; SI-NEXT: v_writelane_b32 v18, s97, 31
; SI-NEXT: v_writelane_b32 v18, s98, 32
+; SI-NEXT: v_writelane_b32 v18, s99, 33
+; SI-NEXT: v_writelane_b32 v18, s30, 34
+; SI-NEXT: v_writelane_b32 v18, s31, 35
; SI-NEXT: v_readfirstlane_b32 s98, v2
; SI-NEXT: v_readfirstlane_b32 s44, v1
-; SI-NEXT: v_writelane_b32 v18, s99, 33
; SI-NEXT: s_lshr_b32 s96, s29, 16
; SI-NEXT: s_lshr_b32 s97, s28, 16
; SI-NEXT: s_lshr_b32 s86, s27, 16
@@ -78086,9 +78087,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; SI-NEXT: s_lshr_b32 s99, s98, 16
; SI-NEXT: s_lshr_b32 s46, s44, 16
; SI-NEXT: v_readfirstlane_b32 s4, v3
-; SI-NEXT: v_writelane_b32 v18, s30, 34
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v18, s31, 35
; SI-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB105_3
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -78722,6 +78721,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v63, s34, 0
; VI-NEXT: v_writelane_b32 v63, s35, 1
; VI-NEXT: v_writelane_b32 v63, s36, 2
@@ -78741,26 +78755,11 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_writelane_b32 v63, s66, 16
; VI-NEXT: v_writelane_b32 v63, s67, 17
; VI-NEXT: v_writelane_b32 v63, s30, 18
-; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_writelane_b32 v63, s31, 19
+; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB105_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s88, s5, 24
@@ -79197,6 +79196,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s34, 0
; GFX9-NEXT: v_writelane_b32 v63, s35, 1
; GFX9-NEXT: v_writelane_b32 v63, s36, 2
@@ -79212,26 +79226,11 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX9-NEXT: v_writelane_b32 v63, s54, 12
; GFX9-NEXT: v_writelane_b32 v63, s55, 13
; GFX9-NEXT: v_writelane_b32 v63, s30, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_writelane_b32 v63, s31, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB105_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
@@ -79611,8 +79610,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v40, s34, 0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s42, 0
; GFX11-NEXT: v_writelane_b32 v40, s35, 1
; GFX11-NEXT: v_writelane_b32 v40, s36, 2
; GFX11-NEXT: v_writelane_b32 v40, s37, 3
@@ -79622,6 +79619,8 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX11-NEXT: v_writelane_b32 v40, s49, 7
; GFX11-NEXT: v_writelane_b32 v40, s30, 8
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
+; GFX11-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-NEXT: s_mov_b32 s42, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB105_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s43, s27, 24
@@ -82449,9 +82448,9 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_writelane_b32 v40, s98, 32
; SI-NEXT: v_writelane_b32 v40, s99, 33
; SI-NEXT: v_writelane_b32 v40, s30, 34
+; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: v_readfirstlane_b32 s36, v28
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76
-; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: v_readfirstlane_b32 s89, v30
; SI-NEXT: v_readfirstlane_b32 s90, v29
; SI-NEXT: v_readfirstlane_b32 s88, v27
@@ -82527,14 +82526,15 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_readfirstlane_b32 s87, v33
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s80, v34
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_readfirstlane_b32 s34, v35
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s57, v37
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s47, v38
; SI-NEXT: v_writelane_b32 v41, s47, 5
-; SI-NEXT: v_readfirstlane_b32 s34, v35
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s98, v36
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_readfirstlane_b32 s20, v39
@@ -84397,6 +84397,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v29, 1.0, v1
@@ -84491,22 +84507,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
; SI-NEXT: v_mul_f32_e32 v28, 1.0, v27
; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22
@@ -85236,10 +85236,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v32bf16_to_v64i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -85256,6 +85252,10 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; implicit-def: $vgpr30
; VI-NEXT: ; implicit-def: $vgpr23
@@ -85825,14 +85825,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v32bf16_to_v64i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; GFX9-NEXT: ; implicit-def: $vgpr17
-; GFX9-NEXT: ; kill: killed $vgpr17
-; GFX9-NEXT: ; implicit-def: $vgpr17
-; GFX9-NEXT: ; kill: killed $vgpr17
-; GFX9-NEXT: ; implicit-def: $vgpr17
-; GFX9-NEXT: ; kill: killed $vgpr17
-; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -85849,6 +85841,14 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: ; implicit-def: $vgpr28
; GFX9-NEXT: ; implicit-def: $vgpr63
; GFX9-NEXT: ; implicit-def: $vgpr24
@@ -87420,9 +87420,11 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_writelane_b32 v40, s96, 30
; SI-NEXT: v_writelane_b32 v40, s97, 31
; SI-NEXT: v_writelane_b32 v40, s98, 32
+; SI-NEXT: v_writelane_b32 v40, s99, 33
+; SI-NEXT: v_writelane_b32 v40, s30, 34
+; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: v_readfirstlane_b32 s42, v2
; SI-NEXT: v_readfirstlane_b32 s44, v1
-; SI-NEXT: v_writelane_b32 v40, s99, 33
; SI-NEXT: s_and_b32 s4, s29, 0xffff0000
; SI-NEXT: s_lshl_b32 s5, s29, 16
; SI-NEXT: s_and_b32 s6, s28, 0xffff0000
@@ -87456,7 +87458,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: s_and_b32 s45, s44, 0xffff0000
; SI-NEXT: s_lshl_b32 s44, s44, 16
; SI-NEXT: v_readfirstlane_b32 s46, v3
-; SI-NEXT: v_writelane_b32 v40, s30, 34
; SI-NEXT: s_cmp_lg_u32 s46, 0
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s41
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16
@@ -87490,7 +87491,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_mul_f32_e64 v37, 1.0, s44
; SI-NEXT: v_mul_f32_e64 v33, 1.0, s43
; SI-NEXT: v_mul_f32_e64 v35, 1.0, s42
-; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB109_4
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -88107,6 +88107,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v63, s34, 0
; VI-NEXT: v_writelane_b32 v63, s35, 1
; VI-NEXT: v_writelane_b32 v63, s36, 2
@@ -88126,26 +88141,11 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_writelane_b32 v63, s66, 16
; VI-NEXT: v_writelane_b32 v63, s67, 17
; VI-NEXT: v_writelane_b32 v63, s30, 18
-; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_writelane_b32 v63, s31, 19
+; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB109_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s65, s5, 24
@@ -88810,6 +88810,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s34, 0
; GFX9-NEXT: v_writelane_b32 v63, s35, 1
; GFX9-NEXT: v_writelane_b32 v63, s36, 2
@@ -88825,26 +88840,11 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: v_writelane_b32 v63, s54, 12
; GFX9-NEXT: v_writelane_b32 v63, s55, 13
; GFX9-NEXT: v_writelane_b32 v63, s30, 14
-; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_writelane_b32 v63, s31, 15
+; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB109_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s91, s5, 24
@@ -89500,8 +89500,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 0
-; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 1
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 2
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 3
@@ -89511,6 +89509,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 7
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 8
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 9
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s27, 24
@@ -90094,8 +90094,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 0
-; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s42, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 1
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 2
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 3
@@ -90105,6 +90103,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 7
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 8
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 9
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s42, 0
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s27, 24
@@ -93205,10 +93205,42 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: v_writelane_b32 v40, s34, 0
; SI-NEXT: v_writelane_b32 v40, s35, 1
; SI-NEXT: v_writelane_b32 v40, s36, 2
+; SI-NEXT: v_writelane_b32 v40, s37, 3
+; SI-NEXT: v_writelane_b32 v40, s38, 4
+; SI-NEXT: v_writelane_b32 v40, s39, 5
+; SI-NEXT: v_writelane_b32 v40, s48, 6
+; SI-NEXT: v_writelane_b32 v40, s49, 7
+; SI-NEXT: v_writelane_b32 v40, s50, 8
+; SI-NEXT: v_writelane_b32 v40, s51, 9
+; SI-NEXT: v_writelane_b32 v40, s52, 10
+; SI-NEXT: v_writelane_b32 v40, s53, 11
+; SI-NEXT: v_writelane_b32 v40, s54, 12
+; SI-NEXT: v_writelane_b32 v40, s55, 13
+; SI-NEXT: v_writelane_b32 v40, s64, 14
+; SI-NEXT: v_writelane_b32 v40, s65, 15
+; SI-NEXT: v_writelane_b32 v40, s66, 16
+; SI-NEXT: v_writelane_b32 v40, s67, 17
+; SI-NEXT: v_writelane_b32 v40, s68, 18
+; SI-NEXT: v_writelane_b32 v40, s69, 19
+; SI-NEXT: v_writelane_b32 v40, s70, 20
+; SI-NEXT: v_writelane_b32 v40, s71, 21
+; SI-NEXT: v_writelane_b32 v40, s80, 22
+; SI-NEXT: v_writelane_b32 v40, s81, 23
+; SI-NEXT: v_writelane_b32 v40, s82, 24
+; SI-NEXT: v_writelane_b32 v40, s83, 25
+; SI-NEXT: v_writelane_b32 v40, s84, 26
+; SI-NEXT: v_writelane_b32 v40, s85, 27
+; SI-NEXT: v_writelane_b32 v40, s86, 28
+; SI-NEXT: v_writelane_b32 v40, s87, 29
+; SI-NEXT: v_writelane_b32 v40, s96, 30
+; SI-NEXT: v_writelane_b32 v40, s97, 31
+; SI-NEXT: v_writelane_b32 v40, s98, 32
+; SI-NEXT: v_writelane_b32 v40, s99, 33
+; SI-NEXT: v_writelane_b32 v40, s30, 34
+; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: s_mov_b32 s6, s19
; SI-NEXT: v_readfirstlane_b32 s19, v28
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76
-; SI-NEXT: v_writelane_b32 v40, s37, 3
; SI-NEXT: v_readfirstlane_b32 s8, v30
; SI-NEXT: v_readfirstlane_b32 s37, v29
; SI-NEXT: v_readfirstlane_b32 s47, v27
@@ -93234,53 +93266,22 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT: v_writelane_b32 v40, s38, 4
-; SI-NEXT: v_writelane_b32 v40, s39, 5
-; SI-NEXT: v_writelane_b32 v40, s48, 6
-; SI-NEXT: v_writelane_b32 v40, s49, 7
-; SI-NEXT: v_writelane_b32 v40, s50, 8
-; SI-NEXT: v_writelane_b32 v40, s51, 9
-; SI-NEXT: v_writelane_b32 v40, s52, 10
-; SI-NEXT: v_writelane_b32 v40, s53, 11
-; SI-NEXT: v_writelane_b32 v40, s54, 12
-; SI-NEXT: v_writelane_b32 v40, s55, 13
-; SI-NEXT: v_writelane_b32 v40, s64, 14
-; SI-NEXT: v_writelane_b32 v40, s65, 15
-; SI-NEXT: v_writelane_b32 v40, s66, 16
-; SI-NEXT: v_writelane_b32 v40, s67, 17
-; SI-NEXT: v_writelane_b32 v40, s68, 18
-; SI-NEXT: v_writelane_b32 v40, s69, 19
-; SI-NEXT: v_writelane_b32 v40, s70, 20
; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v40, s71, 21
+; SI-NEXT: v_readfirstlane_b32 s75, v23
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v41, s18, 0
-; SI-NEXT: v_writelane_b32 v40, s80, 22
; SI-NEXT: v_writelane_b32 v41, s6, 1
-; SI-NEXT: v_writelane_b32 v40, s81, 23
; SI-NEXT: v_writelane_b32 v41, s17, 2
-; SI-NEXT: v_writelane_b32 v40, s82, 24
; SI-NEXT: v_writelane_b32 v41, s16, 3
-; SI-NEXT: v_writelane_b32 v40, s83, 25
; SI-NEXT: v_writelane_b32 v41, s22, 4
-; SI-NEXT: v_writelane_b32 v40, s84, 26
; SI-NEXT: v_writelane_b32 v41, s23, 5
-; SI-NEXT: v_writelane_b32 v40, s85, 27
; SI-NEXT: v_writelane_b32 v41, s21, 6
-; SI-NEXT: v_writelane_b32 v40, s86, 28
; SI-NEXT: v_writelane_b32 v41, s20, 7
-; SI-NEXT: v_writelane_b32 v40, s87, 29
; SI-NEXT: v_writelane_b32 v41, s26, 8
-; SI-NEXT: v_writelane_b32 v40, s96, 30
; SI-NEXT: v_writelane_b32 v41, s27, 9
-; SI-NEXT: v_writelane_b32 v40, s97, 31
; SI-NEXT: v_writelane_b32 v41, s25, 10
-; SI-NEXT: v_writelane_b32 v40, s98, 32
; SI-NEXT: v_writelane_b32 v41, s24, 11
-; SI-NEXT: v_writelane_b32 v40, s99, 33
; SI-NEXT: v_writelane_b32 v41, s29, 12
-; SI-NEXT: v_writelane_b32 v40, s30, 34
-; SI-NEXT: v_readfirstlane_b32 s75, v23
; SI-NEXT: v_readfirstlane_b32 s76, v22
; SI-NEXT: v_readfirstlane_b32 s62, v21
; SI-NEXT: v_readfirstlane_b32 s63, v20
@@ -93304,12 +93305,12 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s42, v2
; SI-NEXT: v_readfirstlane_b32 s7, v1
; SI-NEXT: v_readfirstlane_b32 s10, v0
+; SI-NEXT: v_writelane_b32 v41, s28, 13
+; SI-NEXT: v_writelane_b32 v41, s7, 14
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s87, v24
; SI-NEXT: v_readfirstlane_b32 s86, v25
; SI-NEXT: v_readfirstlane_b32 s98, v26
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s99, v27
; SI-NEXT: v_readfirstlane_b32 s82, v29
; SI-NEXT: s_waitcnt vmcnt(13)
@@ -93324,6 +93325,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s67, v34
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s70, v35
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s80, v36
; SI-NEXT: s_waitcnt vmcnt(6)
@@ -93340,9 +93343,6 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s39, v50
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s51, v51
-; SI-NEXT: v_writelane_b32 v41, s28, 13
-; SI-NEXT: v_writelane_b32 v40, s31, 35
-; SI-NEXT: v_writelane_b32 v41, s7, 14
; SI-NEXT: s_cbranch_scc0 .LBB111_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xff
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index e04fb2918a8ca..9a74ac7e2b943 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -521,8 +521,8 @@ define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s50, 6
; SI-NEXT: v_writelane_b32 v32, s51, 7
; SI-NEXT: v_writelane_b32 v32, s52, 8
-; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: v_writelane_b32 v32, s53, 9
+; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -632,8 +632,8 @@ define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s52, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: v_writelane_b32 v32, s53, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -743,8 +743,8 @@ define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -846,36 +846,36 @@ define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
; GFX11-NEXT: s_cbranch_scc0 .LBB3_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -2175,8 +2175,8 @@ define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s50, 6
; SI-NEXT: v_writelane_b32 v32, s51, 7
; SI-NEXT: v_writelane_b32 v32, s52, 8
-; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: v_writelane_b32 v32, s53, 9
+; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -2277,8 +2277,8 @@ define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s52, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: v_writelane_b32 v32, s53, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -2379,8 +2379,8 @@ define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -2473,36 +2473,36 @@ define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -4113,7 +4113,6 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v36i16_to_v18i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -4130,6 +4129,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: v_mov_b32_e32 v33, v16
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -4445,11 +4445,13 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v18, s54, 10
; SI-NEXT: v_writelane_b32 v18, s55, 11
; SI-NEXT: v_writelane_b32 v18, s64, 12
+; SI-NEXT: v_writelane_b32 v18, s65, 13
+; SI-NEXT: v_writelane_b32 v18, s66, 14
+; SI-NEXT: v_writelane_b32 v18, s67, 15
; SI-NEXT: v_readfirstlane_b32 s7, v3
; SI-NEXT: v_readfirstlane_b32 s10, v2
; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: v_readfirstlane_b32 s72, v0
-; SI-NEXT: v_writelane_b32 v18, s65, 13
; SI-NEXT: s_lshr_b32 s8, s29, 16
; SI-NEXT: s_lshr_b32 s11, s28, 16
; SI-NEXT: s_lshr_b32 s14, s27, 16
@@ -4469,9 +4471,7 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3
; SI-NEXT: s_lshr_b32 s12, s13, 16
; SI-NEXT: s_lshr_b32 s15, s72, 16
; SI-NEXT: v_readfirstlane_b32 s4, v4
-; SI-NEXT: v_writelane_b32 v18, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v18, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB15_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -4683,11 +4683,13 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v18, s54, 10
; VI-NEXT: v_writelane_b32 v18, s55, 11
; VI-NEXT: v_writelane_b32 v18, s64, 12
+; VI-NEXT: v_writelane_b32 v18, s65, 13
+; VI-NEXT: v_writelane_b32 v18, s66, 14
+; VI-NEXT: v_writelane_b32 v18, s67, 15
; VI-NEXT: v_readfirstlane_b32 s7, v3
; VI-NEXT: v_readfirstlane_b32 s10, v2
; VI-NEXT: v_readfirstlane_b32 s13, v1
; VI-NEXT: v_readfirstlane_b32 s72, v0
-; VI-NEXT: v_writelane_b32 v18, s65, 13
; VI-NEXT: s_lshr_b32 s8, s29, 16
; VI-NEXT: s_lshr_b32 s11, s28, 16
; VI-NEXT: s_lshr_b32 s14, s27, 16
@@ -4707,9 +4709,7 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3
; VI-NEXT: s_lshr_b32 s12, s13, 16
; VI-NEXT: s_lshr_b32 s15, s72, 16
; VI-NEXT: v_readfirstlane_b32 s4, v4
-; VI-NEXT: v_writelane_b32 v18, s66, 14
; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: v_writelane_b32 v18, s67, 15
; VI-NEXT: s_cbranch_scc0 .LBB15_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_and_b32 s4, 0xffff, s16
@@ -4916,11 +4916,12 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s49, 5
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
+; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s56, v3
; GFX9-NEXT: v_readfirstlane_b32 s58, v2
; GFX9-NEXT: v_readfirstlane_b32 s60, v1
; GFX9-NEXT: v_readfirstlane_b32 s62, v0
-; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -4940,7 +4941,6 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s61, s60, 16
; GFX9-NEXT: s_lshr_b32 s63, s62, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v4
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -6255,7 +6255,6 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v18i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v32, v17
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -6272,6 +6271,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v32, v17
; SI-NEXT: v_mov_b32_e32 v33, v16
; SI-NEXT: v_mov_b32_e32 v41, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -6754,7 +6754,6 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v36f16_to_v18i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -6771,6 +6770,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: v_mov_b32_e32 v33, v16
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -7087,11 +7087,13 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
+; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_writelane_b32 v32, s66, 14
+; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: v_readfirstlane_b32 s6, v3
; SI-NEXT: v_readfirstlane_b32 s8, v2
; SI-NEXT: v_readfirstlane_b32 s10, v1
; SI-NEXT: v_readfirstlane_b32 s13, v0
-; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: s_lshr_b32 s11, s29, 16
; SI-NEXT: s_lshr_b32 s14, s28, 16
; SI-NEXT: s_lshr_b32 s72, s27, 16
@@ -7111,9 +7113,7 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s12, s10, 16
; SI-NEXT: s_lshr_b32 s15, s13, 16
; SI-NEXT: v_readfirstlane_b32 s4, v4
-; SI-NEXT: v_writelane_b32 v32, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB19_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -7395,11 +7395,13 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s54, 10
; VI-NEXT: v_writelane_b32 v32, s55, 11
; VI-NEXT: v_writelane_b32 v32, s64, 12
+; VI-NEXT: v_writelane_b32 v32, s65, 13
+; VI-NEXT: v_writelane_b32 v32, s66, 14
+; VI-NEXT: v_writelane_b32 v32, s67, 15
; VI-NEXT: v_readfirstlane_b32 s6, v3
; VI-NEXT: v_readfirstlane_b32 s9, v2
; VI-NEXT: v_readfirstlane_b32 s12, v1
; VI-NEXT: v_readfirstlane_b32 s15, v0
-; VI-NEXT: v_writelane_b32 v32, s65, 13
; VI-NEXT: s_lshr_b32 s8, s29, 16
; VI-NEXT: s_lshr_b32 s10, s28, 16
; VI-NEXT: s_lshr_b32 s13, s27, 16
@@ -7419,9 +7421,7 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i
; VI-NEXT: s_lshr_b32 s14, s12, 16
; VI-NEXT: s_lshr_b32 s73, s15, 16
; VI-NEXT: v_readfirstlane_b32 s4, v4
-; VI-NEXT: v_writelane_b32 v32, s66, 14
; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: v_writelane_b32 v32, s67, 15
; VI-NEXT: s_cbranch_scc0 .LBB19_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_and_b32 s4, 0xffff, s16
@@ -7627,11 +7627,12 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s49, 5
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
+; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s56, v3
; GFX9-NEXT: v_readfirstlane_b32 s58, v2
; GFX9-NEXT: v_readfirstlane_b32 s60, v1
; GFX9-NEXT: v_readfirstlane_b32 s62, v0
-; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -7651,7 +7652,6 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s61, s60, 16
; GFX9-NEXT: s_lshr_b32 s63, s62, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v4
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -8000,8 +8000,8 @@ define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i3
; SI-NEXT: v_writelane_b32 v32, s50, 6
; SI-NEXT: v_writelane_b32 v32, s51, 7
; SI-NEXT: v_writelane_b32 v32, s52, 8
-; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: v_writelane_b32 v32, s53, 9
+; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -8111,8 +8111,8 @@ define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i3
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s52, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: v_writelane_b32 v32, s53, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -8222,8 +8222,8 @@ define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -8325,36 +8325,36 @@ define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
; GFX11-NEXT: s_cbranch_scc0 .LBB21_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -8951,8 +8951,8 @@ define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s50, 6
; SI-NEXT: v_writelane_b32 v32, s51, 7
; SI-NEXT: v_writelane_b32 v32, s52, 8
-; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: v_writelane_b32 v32, s53, 9
+; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -9062,8 +9062,8 @@ define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s52, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: v_writelane_b32 v32, s53, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -9173,8 +9173,8 @@ define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -9276,36 +9276,36 @@ define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
; GFX11-NEXT: s_cbranch_scc0 .LBB25_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -9505,8 +9505,8 @@ define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s50, 6
; SI-NEXT: v_writelane_b32 v32, s51, 7
; SI-NEXT: v_writelane_b32 v32, s52, 8
-; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: v_writelane_b32 v32, s53, 9
+; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -9607,8 +9607,8 @@ define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s52, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: v_writelane_b32 v32, s53, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -9709,8 +9709,8 @@ define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -9803,36 +9803,36 @@ define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
; GFX11-NEXT: s_cbranch_scc0 .LBB27_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -11640,7 +11640,6 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v36i16_to_v18f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -11657,6 +11656,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: v_mov_b32_e32 v33, v16
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -11972,11 +11972,13 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a,
; SI-NEXT: v_writelane_b32 v18, s54, 10
; SI-NEXT: v_writelane_b32 v18, s55, 11
; SI-NEXT: v_writelane_b32 v18, s64, 12
+; SI-NEXT: v_writelane_b32 v18, s65, 13
+; SI-NEXT: v_writelane_b32 v18, s66, 14
+; SI-NEXT: v_writelane_b32 v18, s67, 15
; SI-NEXT: v_readfirstlane_b32 s7, v3
; SI-NEXT: v_readfirstlane_b32 s10, v2
; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: v_readfirstlane_b32 s72, v0
-; SI-NEXT: v_writelane_b32 v18, s65, 13
; SI-NEXT: s_lshr_b32 s8, s29, 16
; SI-NEXT: s_lshr_b32 s11, s28, 16
; SI-NEXT: s_lshr_b32 s14, s27, 16
@@ -11996,9 +11998,7 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a,
; SI-NEXT: s_lshr_b32 s12, s13, 16
; SI-NEXT: s_lshr_b32 s15, s72, 16
; SI-NEXT: v_readfirstlane_b32 s4, v4
-; SI-NEXT: v_writelane_b32 v18, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v18, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB31_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -12210,11 +12210,13 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a,
; VI-NEXT: v_writelane_b32 v18, s54, 10
; VI-NEXT: v_writelane_b32 v18, s55, 11
; VI-NEXT: v_writelane_b32 v18, s64, 12
+; VI-NEXT: v_writelane_b32 v18, s65, 13
+; VI-NEXT: v_writelane_b32 v18, s66, 14
+; VI-NEXT: v_writelane_b32 v18, s67, 15
; VI-NEXT: v_readfirstlane_b32 s7, v3
; VI-NEXT: v_readfirstlane_b32 s10, v2
; VI-NEXT: v_readfirstlane_b32 s13, v1
; VI-NEXT: v_readfirstlane_b32 s72, v0
-; VI-NEXT: v_writelane_b32 v18, s65, 13
; VI-NEXT: s_lshr_b32 s8, s29, 16
; VI-NEXT: s_lshr_b32 s11, s28, 16
; VI-NEXT: s_lshr_b32 s14, s27, 16
@@ -12234,9 +12236,7 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a,
; VI-NEXT: s_lshr_b32 s12, s13, 16
; VI-NEXT: s_lshr_b32 s15, s72, 16
; VI-NEXT: v_readfirstlane_b32 s4, v4
-; VI-NEXT: v_writelane_b32 v18, s66, 14
; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: v_writelane_b32 v18, s67, 15
; VI-NEXT: s_cbranch_scc0 .LBB31_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_and_b32 s4, 0xffff, s16
@@ -12443,11 +12443,12 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s49, 5
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
+; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s56, v3
; GFX9-NEXT: v_readfirstlane_b32 s58, v2
; GFX9-NEXT: v_readfirstlane_b32 s60, v1
; GFX9-NEXT: v_readfirstlane_b32 s62, v0
-; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -12467,7 +12468,6 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s61, s60, 16
; GFX9-NEXT: s_lshr_b32 s63, s62, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v4
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -13979,7 +13979,6 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v18f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v32, v17
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -13996,6 +13995,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v32, v17
; SI-NEXT: v_mov_b32_e32 v33, v16
; SI-NEXT: v_mov_b32_e32 v41, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -14478,7 +14478,6 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v36f16_to_v18f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -14495,6 +14494,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: v_mov_b32_e32 v33, v16
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -14811,11 +14811,13 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
+; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_writelane_b32 v32, s66, 14
+; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: v_readfirstlane_b32 s6, v3
; SI-NEXT: v_readfirstlane_b32 s8, v2
; SI-NEXT: v_readfirstlane_b32 s10, v1
; SI-NEXT: v_readfirstlane_b32 s13, v0
-; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: s_lshr_b32 s11, s29, 16
; SI-NEXT: s_lshr_b32 s14, s28, 16
; SI-NEXT: s_lshr_b32 s72, s27, 16
@@ -14835,9 +14837,7 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a,
; SI-NEXT: s_lshr_b32 s12, s10, 16
; SI-NEXT: s_lshr_b32 s15, s13, 16
; SI-NEXT: v_readfirstlane_b32 s4, v4
-; SI-NEXT: v_writelane_b32 v32, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB35_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -15119,11 +15119,13 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s54, 10
; VI-NEXT: v_writelane_b32 v32, s55, 11
; VI-NEXT: v_writelane_b32 v32, s64, 12
+; VI-NEXT: v_writelane_b32 v32, s65, 13
+; VI-NEXT: v_writelane_b32 v32, s66, 14
+; VI-NEXT: v_writelane_b32 v32, s67, 15
; VI-NEXT: v_readfirstlane_b32 s6, v3
; VI-NEXT: v_readfirstlane_b32 s9, v2
; VI-NEXT: v_readfirstlane_b32 s12, v1
; VI-NEXT: v_readfirstlane_b32 s15, v0
-; VI-NEXT: v_writelane_b32 v32, s65, 13
; VI-NEXT: s_lshr_b32 s8, s29, 16
; VI-NEXT: s_lshr_b32 s10, s28, 16
; VI-NEXT: s_lshr_b32 s13, s27, 16
@@ -15143,9 +15145,7 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a,
; VI-NEXT: s_lshr_b32 s14, s12, 16
; VI-NEXT: s_lshr_b32 s73, s15, 16
; VI-NEXT: v_readfirstlane_b32 s4, v4
-; VI-NEXT: v_writelane_b32 v32, s66, 14
; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: v_writelane_b32 v32, s67, 15
; VI-NEXT: s_cbranch_scc0 .LBB35_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_and_b32 s4, 0xffff, s16
@@ -15351,11 +15351,12 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s49, 5
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
+; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s56, v3
; GFX9-NEXT: v_readfirstlane_b32 s58, v2
; GFX9-NEXT: v_readfirstlane_b32 s60, v1
; GFX9-NEXT: v_readfirstlane_b32 s62, v0
-; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -15375,7 +15376,6 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a,
; GFX9-NEXT: s_lshr_b32 s61, s60, 16
; GFX9-NEXT: s_lshr_b32 s63, s62, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v4
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -16066,8 +16066,8 @@ define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32
; SI-NEXT: v_writelane_b32 v32, s50, 6
; SI-NEXT: v_writelane_b32 v32, s51, 7
; SI-NEXT: v_writelane_b32 v32, s52, 8
-; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: v_writelane_b32 v32, s53, 9
+; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -16168,8 +16168,8 @@ define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32
; VI-NEXT: v_writelane_b32 v32, s50, 6
; VI-NEXT: v_writelane_b32 v32, s51, 7
; VI-NEXT: v_writelane_b32 v32, s52, 8
-; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: v_writelane_b32 v32, s53, 9
+; VI-NEXT: v_readfirstlane_b32 s4, v4
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -16270,8 +16270,8 @@ define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -16364,36 +16364,36 @@ define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
; GFX11-NEXT: s_cbranch_scc0 .LBB39_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -18014,7 +18014,6 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v36i16_to_v9i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -18031,6 +18030,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: v_mov_b32_e32 v33, v16
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -18346,11 +18346,13 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32
; SI-NEXT: v_writelane_b32 v18, s54, 10
; SI-NEXT: v_writelane_b32 v18, s55, 11
; SI-NEXT: v_writelane_b32 v18, s64, 12
+; SI-NEXT: v_writelane_b32 v18, s65, 13
+; SI-NEXT: v_writelane_b32 v18, s66, 14
+; SI-NEXT: v_writelane_b32 v18, s67, 15
; SI-NEXT: v_readfirstlane_b32 s7, v3
; SI-NEXT: v_readfirstlane_b32 s10, v2
; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: v_readfirstlane_b32 s72, v0
-; SI-NEXT: v_writelane_b32 v18, s65, 13
; SI-NEXT: s_lshr_b32 s8, s29, 16
; SI-NEXT: s_lshr_b32 s11, s28, 16
; SI-NEXT: s_lshr_b32 s14, s27, 16
@@ -18370,9 +18372,7 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32
; SI-NEXT: s_lshr_b32 s12, s13, 16
; SI-NEXT: s_lshr_b32 s15, s72, 16
; SI-NEXT: v_readfirstlane_b32 s4, v4
-; SI-NEXT: v_writelane_b32 v18, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v18, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB43_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -18584,11 +18584,13 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32
; VI-NEXT: v_writelane_b32 v18, s54, 10
; VI-NEXT: v_writelane_b32 v18, s55, 11
; VI-NEXT: v_writelane_b32 v18, s64, 12
+; VI-NEXT: v_writelane_b32 v18, s65, 13
+; VI-NEXT: v_writelane_b32 v18, s66, 14
+; VI-NEXT: v_writelane_b32 v18, s67, 15
; VI-NEXT: v_readfirstlane_b32 s7, v3
; VI-NEXT: v_readfirstlane_b32 s10, v2
; VI-NEXT: v_readfirstlane_b32 s13, v1
; VI-NEXT: v_readfirstlane_b32 s72, v0
-; VI-NEXT: v_writelane_b32 v18, s65, 13
; VI-NEXT: s_lshr_b32 s8, s29, 16
; VI-NEXT: s_lshr_b32 s11, s28, 16
; VI-NEXT: s_lshr_b32 s14, s27, 16
@@ -18608,9 +18610,7 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32
; VI-NEXT: s_lshr_b32 s12, s13, 16
; VI-NEXT: s_lshr_b32 s15, s72, 16
; VI-NEXT: v_readfirstlane_b32 s4, v4
-; VI-NEXT: v_writelane_b32 v18, s66, 14
; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: v_writelane_b32 v18, s67, 15
; VI-NEXT: s_cbranch_scc0 .LBB43_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_and_b32 s4, 0xffff, s16
@@ -18817,11 +18817,12 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32
; GFX9-NEXT: v_writelane_b32 v32, s49, 5
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
+; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s56, v3
; GFX9-NEXT: v_readfirstlane_b32 s58, v2
; GFX9-NEXT: v_readfirstlane_b32 s60, v1
; GFX9-NEXT: v_readfirstlane_b32 s62, v0
-; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -18841,7 +18842,6 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32
; GFX9-NEXT: s_lshr_b32 s61, s60, 16
; GFX9-NEXT: s_lshr_b32 s63, s62, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v4
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -20166,7 +20166,6 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v9i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v32, v17
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -20183,6 +20182,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v32, v17
; SI-NEXT: v_mov_b32_e32 v33, v16
; SI-NEXT: v_mov_b32_e32 v41, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -20665,7 +20665,6 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v36f16_to_v9i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -20682,6 +20681,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: v_mov_b32_e32 v33, v16
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -20998,11 +20998,13 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
+; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_writelane_b32 v32, s66, 14
+; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: v_readfirstlane_b32 s6, v3
; SI-NEXT: v_readfirstlane_b32 s8, v2
; SI-NEXT: v_readfirstlane_b32 s10, v1
; SI-NEXT: v_readfirstlane_b32 s13, v0
-; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: s_lshr_b32 s11, s29, 16
; SI-NEXT: s_lshr_b32 s14, s28, 16
; SI-NEXT: s_lshr_b32 s72, s27, 16
@@ -21022,9 +21024,7 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32
; SI-NEXT: s_lshr_b32 s12, s10, 16
; SI-NEXT: s_lshr_b32 s15, s13, 16
; SI-NEXT: v_readfirstlane_b32 s4, v4
-; SI-NEXT: v_writelane_b32 v32, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB47_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -21306,11 +21306,13 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32
; VI-NEXT: v_writelane_b32 v32, s54, 10
; VI-NEXT: v_writelane_b32 v32, s55, 11
; VI-NEXT: v_writelane_b32 v32, s64, 12
+; VI-NEXT: v_writelane_b32 v32, s65, 13
+; VI-NEXT: v_writelane_b32 v32, s66, 14
+; VI-NEXT: v_writelane_b32 v32, s67, 15
; VI-NEXT: v_readfirstlane_b32 s6, v3
; VI-NEXT: v_readfirstlane_b32 s9, v2
; VI-NEXT: v_readfirstlane_b32 s12, v1
; VI-NEXT: v_readfirstlane_b32 s15, v0
-; VI-NEXT: v_writelane_b32 v32, s65, 13
; VI-NEXT: s_lshr_b32 s8, s29, 16
; VI-NEXT: s_lshr_b32 s10, s28, 16
; VI-NEXT: s_lshr_b32 s13, s27, 16
@@ -21330,9 +21332,7 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32
; VI-NEXT: s_lshr_b32 s14, s12, 16
; VI-NEXT: s_lshr_b32 s73, s15, 16
; VI-NEXT: v_readfirstlane_b32 s4, v4
-; VI-NEXT: v_writelane_b32 v32, s66, 14
; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: v_writelane_b32 v32, s67, 15
; VI-NEXT: s_cbranch_scc0 .LBB47_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_and_b32 s4, 0xffff, s16
@@ -21538,11 +21538,12 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32
; GFX9-NEXT: v_writelane_b32 v32, s49, 5
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
+; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s56, v3
; GFX9-NEXT: v_readfirstlane_b32 s58, v2
; GFX9-NEXT: v_readfirstlane_b32 s60, v1
; GFX9-NEXT: v_readfirstlane_b32 s62, v0
-; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -21562,7 +21563,6 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32
; GFX9-NEXT: s_lshr_b32 s61, s60, 16
; GFX9-NEXT: s_lshr_b32 s63, s62, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v4
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -23429,7 +23429,6 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v36i16_to_v9f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -23446,6 +23445,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: v_mov_b32_e32 v33, v16
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -23761,11 +23761,13 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v18, s54, 10
; SI-NEXT: v_writelane_b32 v18, s55, 11
; SI-NEXT: v_writelane_b32 v18, s64, 12
+; SI-NEXT: v_writelane_b32 v18, s65, 13
+; SI-NEXT: v_writelane_b32 v18, s66, 14
+; SI-NEXT: v_writelane_b32 v18, s67, 15
; SI-NEXT: v_readfirstlane_b32 s7, v3
; SI-NEXT: v_readfirstlane_b32 s10, v2
; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: v_readfirstlane_b32 s72, v0
-; SI-NEXT: v_writelane_b32 v18, s65, 13
; SI-NEXT: s_lshr_b32 s8, s29, 16
; SI-NEXT: s_lshr_b32 s11, s28, 16
; SI-NEXT: s_lshr_b32 s14, s27, 16
@@ -23785,9 +23787,7 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s12, s13, 16
; SI-NEXT: s_lshr_b32 s15, s72, 16
; SI-NEXT: v_readfirstlane_b32 s4, v4
-; SI-NEXT: v_writelane_b32 v18, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v18, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB51_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -23999,11 +23999,13 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i
; VI-NEXT: v_writelane_b32 v18, s54, 10
; VI-NEXT: v_writelane_b32 v18, s55, 11
; VI-NEXT: v_writelane_b32 v18, s64, 12
+; VI-NEXT: v_writelane_b32 v18, s65, 13
+; VI-NEXT: v_writelane_b32 v18, s66, 14
+; VI-NEXT: v_writelane_b32 v18, s67, 15
; VI-NEXT: v_readfirstlane_b32 s7, v3
; VI-NEXT: v_readfirstlane_b32 s10, v2
; VI-NEXT: v_readfirstlane_b32 s13, v1
; VI-NEXT: v_readfirstlane_b32 s72, v0
-; VI-NEXT: v_writelane_b32 v18, s65, 13
; VI-NEXT: s_lshr_b32 s8, s29, 16
; VI-NEXT: s_lshr_b32 s11, s28, 16
; VI-NEXT: s_lshr_b32 s14, s27, 16
@@ -24023,9 +24025,7 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i
; VI-NEXT: s_lshr_b32 s12, s13, 16
; VI-NEXT: s_lshr_b32 s15, s72, 16
; VI-NEXT: v_readfirstlane_b32 s4, v4
-; VI-NEXT: v_writelane_b32 v18, s66, 14
; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: v_writelane_b32 v18, s67, 15
; VI-NEXT: s_cbranch_scc0 .LBB51_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_and_b32 s4, 0xffff, s16
@@ -24232,11 +24232,12 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s49, 5
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
+; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s56, v3
; GFX9-NEXT: v_readfirstlane_b32 s58, v2
; GFX9-NEXT: v_readfirstlane_b32 s60, v1
; GFX9-NEXT: v_readfirstlane_b32 s62, v0
-; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -24256,7 +24257,6 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s61, s60, 16
; GFX9-NEXT: s_lshr_b32 s63, s62, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v4
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -25696,7 +25696,6 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v9f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v32, v17
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -25713,6 +25712,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v32, v17
; SI-NEXT: v_mov_b32_e32 v33, v16
; SI-NEXT: v_mov_b32_e32 v41, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -26195,7 +26195,6 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v36f16_to_v9f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -26212,6 +26211,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v17
; GFX9-NEXT: v_mov_b32_e32 v33, v16
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -26528,11 +26528,13 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
+; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_writelane_b32 v32, s66, 14
+; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: v_readfirstlane_b32 s6, v3
; SI-NEXT: v_readfirstlane_b32 s8, v2
; SI-NEXT: v_readfirstlane_b32 s10, v1
; SI-NEXT: v_readfirstlane_b32 s13, v0
-; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: s_lshr_b32 s11, s29, 16
; SI-NEXT: s_lshr_b32 s14, s28, 16
; SI-NEXT: s_lshr_b32 s72, s27, 16
@@ -26552,9 +26554,7 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a,
; SI-NEXT: s_lshr_b32 s12, s10, 16
; SI-NEXT: s_lshr_b32 s15, s13, 16
; SI-NEXT: v_readfirstlane_b32 s4, v4
-; SI-NEXT: v_writelane_b32 v32, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB55_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -26836,11 +26836,13 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s54, 10
; VI-NEXT: v_writelane_b32 v32, s55, 11
; VI-NEXT: v_writelane_b32 v32, s64, 12
+; VI-NEXT: v_writelane_b32 v32, s65, 13
+; VI-NEXT: v_writelane_b32 v32, s66, 14
+; VI-NEXT: v_writelane_b32 v32, s67, 15
; VI-NEXT: v_readfirstlane_b32 s6, v3
; VI-NEXT: v_readfirstlane_b32 s9, v2
; VI-NEXT: v_readfirstlane_b32 s12, v1
; VI-NEXT: v_readfirstlane_b32 s15, v0
-; VI-NEXT: v_writelane_b32 v32, s65, 13
; VI-NEXT: s_lshr_b32 s8, s29, 16
; VI-NEXT: s_lshr_b32 s10, s28, 16
; VI-NEXT: s_lshr_b32 s13, s27, 16
@@ -26860,9 +26862,7 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a,
; VI-NEXT: s_lshr_b32 s14, s12, 16
; VI-NEXT: s_lshr_b32 s73, s15, 16
; VI-NEXT: v_readfirstlane_b32 s4, v4
-; VI-NEXT: v_writelane_b32 v32, s66, 14
; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: v_writelane_b32 v32, s67, 15
; VI-NEXT: s_cbranch_scc0 .LBB55_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_and_b32 s4, 0xffff, s16
@@ -27068,11 +27068,12 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s49, 5
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
+; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s56, v3
; GFX9-NEXT: v_readfirstlane_b32 s58, v2
; GFX9-NEXT: v_readfirstlane_b32 s60, v1
; GFX9-NEXT: v_readfirstlane_b32 s62, v0
-; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -27092,7 +27093,6 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a,
; GFX9-NEXT: s_lshr_b32 s61, s60, 16
; GFX9-NEXT: s_lshr_b32 s63, s62, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v4
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -27295,6 +27295,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36i16_to_v36f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; kill: killed $vgpr18
@@ -27327,22 +27343,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) #0 {
; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v0
; SI-NEXT: ; kill: killed $vgpr18
; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v35
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v36
@@ -28068,11 +28068,11 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v18, s54, 12
; SI-NEXT: v_writelane_b32 v18, s55, 13
; SI-NEXT: v_writelane_b32 v18, s30, 14
+; SI-NEXT: v_writelane_b32 v18, s31, 15
; SI-NEXT: v_readfirstlane_b32 s53, v3
; SI-NEXT: v_readfirstlane_b32 s55, v2
; SI-NEXT: v_readfirstlane_b32 s50, v1
; SI-NEXT: v_readfirstlane_b32 s52, v0
-; SI-NEXT: v_writelane_b32 v18, s31, 15
; SI-NEXT: s_lshr_b32 s30, s29, 16
; SI-NEXT: s_lshr_b32 s49, s28, 16
; SI-NEXT: s_lshr_b32 s95, s27, 16
@@ -29592,6 +29592,12 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i
; SI-LABEL: bitcast_v36f16_to_v36i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s8, v3
; SI-NEXT: v_readfirstlane_b32 s6, v2
; SI-NEXT: v_readfirstlane_b32 s9, v1
@@ -29616,12 +29622,6 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s44, s7, 16
; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB59_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_cbranch_execnz .LBB59_4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index b59e14fe0da33..69562f9d0a70d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -561,8 +561,8 @@ define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -680,8 +680,8 @@ define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -799,8 +799,8 @@ define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -908,40 +908,40 @@ define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB3_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -2344,8 +2344,8 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -2453,8 +2453,8 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -2562,8 +2562,8 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -2661,40 +2661,40 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -4484,7 +4484,6 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v40i16_to_v20i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -4501,6 +4500,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v18
; GFX9-NEXT: v_mov_b32_e32 v43, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -4858,13 +4858,15 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v20, s54, 10
; SI-NEXT: v_writelane_b32 v20, s55, 11
; SI-NEXT: v_writelane_b32 v20, s64, 12
+; SI-NEXT: v_writelane_b32 v20, s65, 13
+; SI-NEXT: v_writelane_b32 v20, s66, 14
+; SI-NEXT: v_writelane_b32 v20, s67, 15
; SI-NEXT: v_readfirstlane_b32 s7, v5
; SI-NEXT: v_readfirstlane_b32 s9, v4
; SI-NEXT: v_readfirstlane_b32 s12, v3
; SI-NEXT: v_readfirstlane_b32 s15, v2
; SI-NEXT: v_readfirstlane_b32 s74, v1
; SI-NEXT: v_readfirstlane_b32 s77, v0
-; SI-NEXT: v_writelane_b32 v20, s65, 13
; SI-NEXT: s_lshr_b32 s11, s29, 16
; SI-NEXT: s_lshr_b32 s13, s28, 16
; SI-NEXT: s_lshr_b32 s72, s27, 16
@@ -4886,9 +4888,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; SI-NEXT: s_lshr_b32 s73, s74, 16
; SI-NEXT: s_lshr_b32 s76, s77, 16
; SI-NEXT: v_readfirstlane_b32 s4, v6
-; SI-NEXT: v_writelane_b32 v20, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v20, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB15_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -5124,13 +5124,13 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v20, s66, 16
; VI-NEXT: v_writelane_b32 v20, s67, 17
; VI-NEXT: v_writelane_b32 v20, s30, 18
+; VI-NEXT: v_writelane_b32 v20, s31, 19
; VI-NEXT: v_readfirstlane_b32 s7, v5
; VI-NEXT: v_readfirstlane_b32 s9, v4
; VI-NEXT: v_readfirstlane_b32 s12, v3
; VI-NEXT: v_readfirstlane_b32 s15, v2
; VI-NEXT: v_readfirstlane_b32 s74, v1
; VI-NEXT: v_readfirstlane_b32 s77, v0
-; VI-NEXT: v_writelane_b32 v20, s31, 19
; VI-NEXT: s_lshr_b32 s11, s29, 16
; VI-NEXT: s_lshr_b32 s13, s28, 16
; VI-NEXT: s_lshr_b32 s72, s27, 16
@@ -5383,13 +5383,14 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s56, v5
; GFX9-NEXT: v_readfirstlane_b32 s58, v4
; GFX9-NEXT: v_readfirstlane_b32 s60, v3
; GFX9-NEXT: v_readfirstlane_b32 s62, v2
; GFX9-NEXT: v_readfirstlane_b32 s72, v1
; GFX9-NEXT: v_readfirstlane_b32 s74, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -5411,7 +5412,6 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s73, s72, 16
; GFX9-NEXT: s_lshr_b32 s75, s74, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v6
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -6851,7 +6851,6 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v20i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v32, v19
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -6868,6 +6867,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v32, v19
; SI-NEXT: v_mov_b32_e32 v33, v18
; SI-NEXT: v_mov_b32_e32 v43, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -7426,7 +7426,6 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v40f16_to_v20i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -7443,6 +7442,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v18
; GFX9-NEXT: v_mov_b32_e32 v43, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -7801,13 +7801,15 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
+; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_writelane_b32 v32, s66, 14
+; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: v_readfirstlane_b32 s6, v5
; SI-NEXT: v_readfirstlane_b32 s8, v4
; SI-NEXT: v_readfirstlane_b32 s10, v3
; SI-NEXT: v_readfirstlane_b32 s12, v2
; SI-NEXT: v_readfirstlane_b32 s14, v1
; SI-NEXT: v_readfirstlane_b32 s73, v0
-; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: s_lshr_b32 s72, s29, 16
; SI-NEXT: s_lshr_b32 s75, s28, 16
; SI-NEXT: s_lshr_b32 s76, s27, 16
@@ -7829,9 +7831,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s15, s14, 16
; SI-NEXT: s_lshr_b32 s74, s73, 16
; SI-NEXT: v_readfirstlane_b32 s4, v6
-; SI-NEXT: v_writelane_b32 v32, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB19_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -8141,13 +8141,13 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s66, 16
; VI-NEXT: v_writelane_b32 v32, s67, 17
; VI-NEXT: v_writelane_b32 v32, s30, 18
+; VI-NEXT: v_writelane_b32 v32, s31, 19
; VI-NEXT: v_readfirstlane_b32 s6, v5
; VI-NEXT: v_readfirstlane_b32 s8, v4
; VI-NEXT: v_readfirstlane_b32 s11, v3
; VI-NEXT: v_readfirstlane_b32 s14, v2
; VI-NEXT: v_readfirstlane_b32 s73, v1
; VI-NEXT: v_readfirstlane_b32 s76, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 19
; VI-NEXT: s_lshr_b32 s10, s29, 16
; VI-NEXT: s_lshr_b32 s13, s28, 16
; VI-NEXT: s_lshr_b32 s72, s27, 16
@@ -8395,13 +8395,14 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s56, v5
; GFX9-NEXT: v_readfirstlane_b32 s58, v4
; GFX9-NEXT: v_readfirstlane_b32 s60, v3
; GFX9-NEXT: v_readfirstlane_b32 s62, v2
; GFX9-NEXT: v_readfirstlane_b32 s72, v1
; GFX9-NEXT: v_readfirstlane_b32 s74, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -8423,7 +8424,6 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s73, s72, 16
; GFX9-NEXT: s_lshr_b32 s75, s74, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v6
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -8795,8 +8795,8 @@ define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -8914,8 +8914,8 @@ define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -9033,8 +9033,8 @@ define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -9142,40 +9142,40 @@ define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB21_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -9816,8 +9816,8 @@ define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -9935,8 +9935,8 @@ define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -10054,8 +10054,8 @@ define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -10163,40 +10163,40 @@ define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB25_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -10406,8 +10406,8 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -10515,8 +10515,8 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -10624,8 +10624,8 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -10723,40 +10723,40 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB27_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -12764,7 +12764,6 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v40i16_to_v20f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -12781,6 +12780,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v18
; GFX9-NEXT: v_mov_b32_e32 v43, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -13138,13 +13138,15 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; SI-NEXT: v_writelane_b32 v20, s54, 10
; SI-NEXT: v_writelane_b32 v20, s55, 11
; SI-NEXT: v_writelane_b32 v20, s64, 12
+; SI-NEXT: v_writelane_b32 v20, s65, 13
+; SI-NEXT: v_writelane_b32 v20, s66, 14
+; SI-NEXT: v_writelane_b32 v20, s67, 15
; SI-NEXT: v_readfirstlane_b32 s7, v5
; SI-NEXT: v_readfirstlane_b32 s9, v4
; SI-NEXT: v_readfirstlane_b32 s12, v3
; SI-NEXT: v_readfirstlane_b32 s15, v2
; SI-NEXT: v_readfirstlane_b32 s74, v1
; SI-NEXT: v_readfirstlane_b32 s77, v0
-; SI-NEXT: v_writelane_b32 v20, s65, 13
; SI-NEXT: s_lshr_b32 s11, s29, 16
; SI-NEXT: s_lshr_b32 s13, s28, 16
; SI-NEXT: s_lshr_b32 s72, s27, 16
@@ -13166,9 +13168,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; SI-NEXT: s_lshr_b32 s73, s74, 16
; SI-NEXT: s_lshr_b32 s76, s77, 16
; SI-NEXT: v_readfirstlane_b32 s4, v6
-; SI-NEXT: v_writelane_b32 v20, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v20, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB31_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -13404,13 +13404,13 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; VI-NEXT: v_writelane_b32 v20, s66, 16
; VI-NEXT: v_writelane_b32 v20, s67, 17
; VI-NEXT: v_writelane_b32 v20, s30, 18
+; VI-NEXT: v_writelane_b32 v20, s31, 19
; VI-NEXT: v_readfirstlane_b32 s7, v5
; VI-NEXT: v_readfirstlane_b32 s9, v4
; VI-NEXT: v_readfirstlane_b32 s12, v3
; VI-NEXT: v_readfirstlane_b32 s15, v2
; VI-NEXT: v_readfirstlane_b32 s74, v1
; VI-NEXT: v_readfirstlane_b32 s77, v0
-; VI-NEXT: v_writelane_b32 v20, s31, 19
; VI-NEXT: s_lshr_b32 s11, s29, 16
; VI-NEXT: s_lshr_b32 s13, s28, 16
; VI-NEXT: s_lshr_b32 s72, s27, 16
@@ -13663,13 +13663,14 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s56, v5
; GFX9-NEXT: v_readfirstlane_b32 s58, v4
; GFX9-NEXT: v_readfirstlane_b32 s60, v3
; GFX9-NEXT: v_readfirstlane_b32 s62, v2
; GFX9-NEXT: v_readfirstlane_b32 s72, v1
; GFX9-NEXT: v_readfirstlane_b32 s74, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -13691,7 +13692,6 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s73, s72, 16
; GFX9-NEXT: s_lshr_b32 s75, s74, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v6
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -15349,7 +15349,6 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v20f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v32, v19
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -15366,6 +15365,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v32, v19
; SI-NEXT: v_mov_b32_e32 v33, v18
; SI-NEXT: v_mov_b32_e32 v43, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -15924,7 +15924,6 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v40f16_to_v20f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -15941,6 +15940,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v18
; GFX9-NEXT: v_mov_b32_e32 v43, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -16299,13 +16299,15 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
+; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_writelane_b32 v32, s66, 14
+; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: v_readfirstlane_b32 s6, v5
; SI-NEXT: v_readfirstlane_b32 s8, v4
; SI-NEXT: v_readfirstlane_b32 s10, v3
; SI-NEXT: v_readfirstlane_b32 s12, v2
; SI-NEXT: v_readfirstlane_b32 s14, v1
; SI-NEXT: v_readfirstlane_b32 s73, v0
-; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: s_lshr_b32 s72, s29, 16
; SI-NEXT: s_lshr_b32 s75, s28, 16
; SI-NEXT: s_lshr_b32 s76, s27, 16
@@ -16327,9 +16329,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: s_lshr_b32 s15, s14, 16
; SI-NEXT: s_lshr_b32 s74, s73, 16
; SI-NEXT: v_readfirstlane_b32 s4, v6
-; SI-NEXT: v_writelane_b32 v32, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB35_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -16639,13 +16639,13 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s66, 16
; VI-NEXT: v_writelane_b32 v32, s67, 17
; VI-NEXT: v_writelane_b32 v32, s30, 18
+; VI-NEXT: v_writelane_b32 v32, s31, 19
; VI-NEXT: v_readfirstlane_b32 s6, v5
; VI-NEXT: v_readfirstlane_b32 s8, v4
; VI-NEXT: v_readfirstlane_b32 s11, v3
; VI-NEXT: v_readfirstlane_b32 s14, v2
; VI-NEXT: v_readfirstlane_b32 s73, v1
; VI-NEXT: v_readfirstlane_b32 s76, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 19
; VI-NEXT: s_lshr_b32 s10, s29, 16
; VI-NEXT: s_lshr_b32 s13, s28, 16
; VI-NEXT: s_lshr_b32 s72, s27, 16
@@ -16893,13 +16893,14 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s56, v5
; GFX9-NEXT: v_readfirstlane_b32 s58, v4
; GFX9-NEXT: v_readfirstlane_b32 s60, v3
; GFX9-NEXT: v_readfirstlane_b32 s62, v2
; GFX9-NEXT: v_readfirstlane_b32 s72, v1
; GFX9-NEXT: v_readfirstlane_b32 s74, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -16921,7 +16922,6 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX9-NEXT: s_lshr_b32 s73, s72, 16
; GFX9-NEXT: s_lshr_b32 s75, s74, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v6
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -17663,8 +17663,8 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -17772,8 +17772,8 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v6
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -17881,8 +17881,8 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v6
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -17980,40 +17980,40 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
+; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
-; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB39_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -19813,7 +19813,6 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v40i16_to_v10i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -19830,6 +19829,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v18
; GFX9-NEXT: v_mov_b32_e32 v43, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -20187,13 +20187,15 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v20, s54, 10
; SI-NEXT: v_writelane_b32 v20, s55, 11
; SI-NEXT: v_writelane_b32 v20, s64, 12
+; SI-NEXT: v_writelane_b32 v20, s65, 13
+; SI-NEXT: v_writelane_b32 v20, s66, 14
+; SI-NEXT: v_writelane_b32 v20, s67, 15
; SI-NEXT: v_readfirstlane_b32 s7, v5
; SI-NEXT: v_readfirstlane_b32 s9, v4
; SI-NEXT: v_readfirstlane_b32 s12, v3
; SI-NEXT: v_readfirstlane_b32 s15, v2
; SI-NEXT: v_readfirstlane_b32 s74, v1
; SI-NEXT: v_readfirstlane_b32 s77, v0
-; SI-NEXT: v_writelane_b32 v20, s65, 13
; SI-NEXT: s_lshr_b32 s11, s29, 16
; SI-NEXT: s_lshr_b32 s13, s28, 16
; SI-NEXT: s_lshr_b32 s72, s27, 16
@@ -20215,9 +20217,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; SI-NEXT: s_lshr_b32 s73, s74, 16
; SI-NEXT: s_lshr_b32 s76, s77, 16
; SI-NEXT: v_readfirstlane_b32 s4, v6
-; SI-NEXT: v_writelane_b32 v20, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v20, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB43_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -20453,13 +20453,13 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v20, s66, 16
; VI-NEXT: v_writelane_b32 v20, s67, 17
; VI-NEXT: v_writelane_b32 v20, s30, 18
+; VI-NEXT: v_writelane_b32 v20, s31, 19
; VI-NEXT: v_readfirstlane_b32 s7, v5
; VI-NEXT: v_readfirstlane_b32 s9, v4
; VI-NEXT: v_readfirstlane_b32 s12, v3
; VI-NEXT: v_readfirstlane_b32 s15, v2
; VI-NEXT: v_readfirstlane_b32 s74, v1
; VI-NEXT: v_readfirstlane_b32 s77, v0
-; VI-NEXT: v_writelane_b32 v20, s31, 19
; VI-NEXT: s_lshr_b32 s11, s29, 16
; VI-NEXT: s_lshr_b32 s13, s28, 16
; VI-NEXT: s_lshr_b32 s72, s27, 16
@@ -20712,13 +20712,14 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s56, v5
; GFX9-NEXT: v_readfirstlane_b32 s58, v4
; GFX9-NEXT: v_readfirstlane_b32 s60, v3
; GFX9-NEXT: v_readfirstlane_b32 s62, v2
; GFX9-NEXT: v_readfirstlane_b32 s72, v1
; GFX9-NEXT: v_readfirstlane_b32 s74, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -20740,7 +20741,6 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s73, s72, 16
; GFX9-NEXT: s_lshr_b32 s75, s74, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v6
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -22190,7 +22190,6 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v10i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v32, v19
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -22207,6 +22206,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v32, v19
; SI-NEXT: v_mov_b32_e32 v33, v18
; SI-NEXT: v_mov_b32_e32 v43, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -22765,7 +22765,6 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v40f16_to_v10i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -22782,6 +22781,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v18
; GFX9-NEXT: v_mov_b32_e32 v43, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -23140,13 +23140,15 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
+; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_writelane_b32 v32, s66, 14
+; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: v_readfirstlane_b32 s6, v5
; SI-NEXT: v_readfirstlane_b32 s8, v4
; SI-NEXT: v_readfirstlane_b32 s10, v3
; SI-NEXT: v_readfirstlane_b32 s12, v2
; SI-NEXT: v_readfirstlane_b32 s14, v1
; SI-NEXT: v_readfirstlane_b32 s73, v0
-; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: s_lshr_b32 s72, s29, 16
; SI-NEXT: s_lshr_b32 s75, s28, 16
; SI-NEXT: s_lshr_b32 s76, s27, 16
@@ -23168,9 +23170,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s15, s14, 16
; SI-NEXT: s_lshr_b32 s74, s73, 16
; SI-NEXT: v_readfirstlane_b32 s4, v6
-; SI-NEXT: v_writelane_b32 v32, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB47_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -23480,13 +23480,13 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s66, 16
; VI-NEXT: v_writelane_b32 v32, s67, 17
; VI-NEXT: v_writelane_b32 v32, s30, 18
+; VI-NEXT: v_writelane_b32 v32, s31, 19
; VI-NEXT: v_readfirstlane_b32 s6, v5
; VI-NEXT: v_readfirstlane_b32 s8, v4
; VI-NEXT: v_readfirstlane_b32 s11, v3
; VI-NEXT: v_readfirstlane_b32 s14, v2
; VI-NEXT: v_readfirstlane_b32 s73, v1
; VI-NEXT: v_readfirstlane_b32 s76, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 19
; VI-NEXT: s_lshr_b32 s10, s29, 16
; VI-NEXT: s_lshr_b32 s13, s28, 16
; VI-NEXT: s_lshr_b32 s72, s27, 16
@@ -23734,13 +23734,14 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s56, v5
; GFX9-NEXT: v_readfirstlane_b32 s58, v4
; GFX9-NEXT: v_readfirstlane_b32 s60, v3
; GFX9-NEXT: v_readfirstlane_b32 s62, v2
; GFX9-NEXT: v_readfirstlane_b32 s72, v1
; GFX9-NEXT: v_readfirstlane_b32 s74, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -23762,7 +23763,6 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s73, s72, 16
; GFX9-NEXT: s_lshr_b32 s75, s74, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v6
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -25832,7 +25832,6 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v40i16_to_v10f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -25849,6 +25848,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v18
; GFX9-NEXT: v_mov_b32_e32 v43, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -26206,13 +26206,15 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; SI-NEXT: v_writelane_b32 v20, s54, 10
; SI-NEXT: v_writelane_b32 v20, s55, 11
; SI-NEXT: v_writelane_b32 v20, s64, 12
+; SI-NEXT: v_writelane_b32 v20, s65, 13
+; SI-NEXT: v_writelane_b32 v20, s66, 14
+; SI-NEXT: v_writelane_b32 v20, s67, 15
; SI-NEXT: v_readfirstlane_b32 s7, v5
; SI-NEXT: v_readfirstlane_b32 s9, v4
; SI-NEXT: v_readfirstlane_b32 s12, v3
; SI-NEXT: v_readfirstlane_b32 s15, v2
; SI-NEXT: v_readfirstlane_b32 s74, v1
; SI-NEXT: v_readfirstlane_b32 s77, v0
-; SI-NEXT: v_writelane_b32 v20, s65, 13
; SI-NEXT: s_lshr_b32 s11, s29, 16
; SI-NEXT: s_lshr_b32 s13, s28, 16
; SI-NEXT: s_lshr_b32 s72, s27, 16
@@ -26234,9 +26236,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; SI-NEXT: s_lshr_b32 s73, s74, 16
; SI-NEXT: s_lshr_b32 s76, s77, 16
; SI-NEXT: v_readfirstlane_b32 s4, v6
-; SI-NEXT: v_writelane_b32 v20, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v20, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB51_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -26472,13 +26472,13 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; VI-NEXT: v_writelane_b32 v20, s66, 16
; VI-NEXT: v_writelane_b32 v20, s67, 17
; VI-NEXT: v_writelane_b32 v20, s30, 18
+; VI-NEXT: v_writelane_b32 v20, s31, 19
; VI-NEXT: v_readfirstlane_b32 s7, v5
; VI-NEXT: v_readfirstlane_b32 s9, v4
; VI-NEXT: v_readfirstlane_b32 s12, v3
; VI-NEXT: v_readfirstlane_b32 s15, v2
; VI-NEXT: v_readfirstlane_b32 s74, v1
; VI-NEXT: v_readfirstlane_b32 s77, v0
-; VI-NEXT: v_writelane_b32 v20, s31, 19
; VI-NEXT: s_lshr_b32 s11, s29, 16
; VI-NEXT: s_lshr_b32 s13, s28, 16
; VI-NEXT: s_lshr_b32 s72, s27, 16
@@ -26731,13 +26731,14 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s56, v5
; GFX9-NEXT: v_readfirstlane_b32 s58, v4
; GFX9-NEXT: v_readfirstlane_b32 s60, v3
; GFX9-NEXT: v_readfirstlane_b32 s62, v2
; GFX9-NEXT: v_readfirstlane_b32 s72, v1
; GFX9-NEXT: v_readfirstlane_b32 s74, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -26759,7 +26760,6 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s73, s72, 16
; GFX9-NEXT: s_lshr_b32 s75, s74, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v6
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -28337,7 +28337,6 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v10f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v32, v19
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -28354,6 +28353,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v32, v19
; SI-NEXT: v_mov_b32_e32 v33, v18
; SI-NEXT: v_mov_b32_e32 v43, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -28912,7 +28912,6 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v40f16_to_v10f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -28929,6 +28928,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v18
; GFX9-NEXT: v_mov_b32_e32 v43, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -29287,13 +29287,15 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
+; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_writelane_b32 v32, s66, 14
+; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: v_readfirstlane_b32 s6, v5
; SI-NEXT: v_readfirstlane_b32 s8, v4
; SI-NEXT: v_readfirstlane_b32 s10, v3
; SI-NEXT: v_readfirstlane_b32 s12, v2
; SI-NEXT: v_readfirstlane_b32 s14, v1
; SI-NEXT: v_readfirstlane_b32 s73, v0
-; SI-NEXT: v_writelane_b32 v32, s65, 13
; SI-NEXT: s_lshr_b32 s72, s29, 16
; SI-NEXT: s_lshr_b32 s75, s28, 16
; SI-NEXT: s_lshr_b32 s76, s27, 16
@@ -29315,9 +29317,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: s_lshr_b32 s15, s14, 16
; SI-NEXT: s_lshr_b32 s74, s73, 16
; SI-NEXT: v_readfirstlane_b32 s4, v6
-; SI-NEXT: v_writelane_b32 v32, s66, 14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v32, s67, 15
; SI-NEXT: s_cbranch_scc0 .LBB55_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -29627,13 +29627,13 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; VI-NEXT: v_writelane_b32 v32, s66, 16
; VI-NEXT: v_writelane_b32 v32, s67, 17
; VI-NEXT: v_writelane_b32 v32, s30, 18
+; VI-NEXT: v_writelane_b32 v32, s31, 19
; VI-NEXT: v_readfirstlane_b32 s6, v5
; VI-NEXT: v_readfirstlane_b32 s8, v4
; VI-NEXT: v_readfirstlane_b32 s11, v3
; VI-NEXT: v_readfirstlane_b32 s14, v2
; VI-NEXT: v_readfirstlane_b32 s73, v1
; VI-NEXT: v_readfirstlane_b32 s76, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 19
; VI-NEXT: s_lshr_b32 s10, s29, 16
; VI-NEXT: s_lshr_b32 s13, s28, 16
; VI-NEXT: s_lshr_b32 s72, s27, 16
@@ -29881,13 +29881,14 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s56, v5
; GFX9-NEXT: v_readfirstlane_b32 s58, v4
; GFX9-NEXT: v_readfirstlane_b32 s60, v3
; GFX9-NEXT: v_readfirstlane_b32 s62, v2
; GFX9-NEXT: v_readfirstlane_b32 s72, v1
; GFX9-NEXT: v_readfirstlane_b32 s74, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -29909,7 +29910,6 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX9-NEXT: s_lshr_b32 s73, s72, 16
; GFX9-NEXT: s_lshr_b32 s75, s74, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v6
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -30126,6 +30126,22 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i16_to_v40f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; kill: killed $vgpr20
@@ -30176,22 +30192,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) #0 {
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0
; SI-NEXT: ; kill: killed $vgpr20
; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v48
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v39
@@ -31021,13 +31021,13 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v20, s70, 20
; SI-NEXT: v_writelane_b32 v20, s71, 21
; SI-NEXT: v_writelane_b32 v20, s30, 22
+; SI-NEXT: v_writelane_b32 v20, s31, 23
; SI-NEXT: v_readfirstlane_b32 s69, v5
; SI-NEXT: v_readfirstlane_b32 s71, v4
; SI-NEXT: v_readfirstlane_b32 s66, v3
; SI-NEXT: v_readfirstlane_b32 s68, v2
; SI-NEXT: v_readfirstlane_b32 s55, v1
; SI-NEXT: v_readfirstlane_b32 s65, v0
-; SI-NEXT: v_writelane_b32 v20, s31, 23
; SI-NEXT: s_lshr_b32 s36, s29, 16
; SI-NEXT: s_lshr_b32 s54, s28, 16
; SI-NEXT: s_lshr_b32 s35, s27, 16
@@ -32708,6 +32708,18 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; SI-LABEL: bitcast_v40f16_to_v40i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s9, v5
; SI-NEXT: v_readfirstlane_b32 s6, v4
; SI-NEXT: v_readfirstlane_b32 s10, v3
@@ -32736,18 +32748,6 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s56, s8, 16
; SI-NEXT: v_readfirstlane_b32 s4, v6
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB59_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_cbranch_execnz .LBB59_4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index f1c80ed5d2873..df100725d890c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -599,8 +599,8 @@ define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -722,8 +722,8 @@ define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -845,8 +845,8 @@ define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -958,42 +958,42 @@ define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB3_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -2496,8 +2496,8 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -2608,8 +2608,8 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -2720,8 +2720,8 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -2822,42 +2822,42 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -4817,7 +4817,6 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v44i16_to_v22i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -4834,6 +4833,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: v_mov_b32_e32 v33, v20
; GFX9-NEXT: v_mov_b32_e32 v45, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -5239,6 +5239,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v22, s66, 16
; SI-NEXT: v_writelane_b32 v22, s67, 17
; SI-NEXT: v_writelane_b32 v22, s30, 18
+; SI-NEXT: v_writelane_b32 v22, s31, 19
; SI-NEXT: v_readfirstlane_b32 s7, v7
; SI-NEXT: v_readfirstlane_b32 s9, v6
; SI-NEXT: v_readfirstlane_b32 s11, v5
@@ -5247,7 +5248,6 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s75, v2
; SI-NEXT: v_readfirstlane_b32 s79, v1
; SI-NEXT: v_readfirstlane_b32 s90, v0
-; SI-NEXT: v_writelane_b32 v22, s31, 19
; SI-NEXT: s_lshr_b32 s14, s29, 16
; SI-NEXT: s_lshr_b32 s73, s28, 16
; SI-NEXT: s_lshr_b32 s76, s27, 16
@@ -5533,6 +5533,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v22, s70, 20
; VI-NEXT: v_writelane_b32 v22, s71, 21
; VI-NEXT: v_writelane_b32 v22, s30, 22
+; VI-NEXT: v_writelane_b32 v22, s31, 23
; VI-NEXT: v_readfirstlane_b32 s7, v7
; VI-NEXT: v_readfirstlane_b32 s9, v6
; VI-NEXT: v_readfirstlane_b32 s11, v5
@@ -5541,7 +5542,6 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s75, v2
; VI-NEXT: v_readfirstlane_b32 s79, v1
; VI-NEXT: v_readfirstlane_b32 s90, v0
-; VI-NEXT: v_writelane_b32 v22, s31, 23
; VI-NEXT: s_lshr_b32 s14, s29, 16
; VI-NEXT: s_lshr_b32 s73, s28, 16
; VI-NEXT: s_lshr_b32 s76, s27, 16
@@ -5818,6 +5818,8 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s57, v7
; GFX9-NEXT: v_readfirstlane_b32 s56, v6
; GFX9-NEXT: v_readfirstlane_b32 s60, v5
@@ -5826,7 +5828,6 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s76, v1
; GFX9-NEXT: v_readfirstlane_b32 s78, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -5850,7 +5851,6 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s77, s76, 16
; GFX9-NEXT: s_lshr_b32 s79, s78, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v8
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -8060,7 +8060,6 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v44f16_to_v22i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -8077,6 +8076,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: v_mov_b32_e32 v33, v20
; GFX9-NEXT: v_mov_b32_e32 v45, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -8483,6 +8483,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s66, 16
; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_writelane_b32 v32, s30, 18
+; SI-NEXT: v_writelane_b32 v32, s31, 19
; SI-NEXT: v_readfirstlane_b32 s6, v7
; SI-NEXT: v_readfirstlane_b32 s8, v6
; SI-NEXT: v_readfirstlane_b32 s10, v5
@@ -8491,7 +8492,6 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s72, v2
; SI-NEXT: v_readfirstlane_b32 s74, v1
; SI-NEXT: v_readfirstlane_b32 s77, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 19
; SI-NEXT: s_lshr_b32 s75, s29, 16
; SI-NEXT: s_lshr_b32 s78, s28, 16
; SI-NEXT: s_lshr_b32 s88, s27, 16
@@ -8855,6 +8855,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s70, 20
; VI-NEXT: v_writelane_b32 v32, s71, 21
; VI-NEXT: v_writelane_b32 v32, s30, 22
+; VI-NEXT: v_writelane_b32 v32, s31, 23
; VI-NEXT: v_readfirstlane_b32 s6, v7
; VI-NEXT: v_readfirstlane_b32 s8, v6
; VI-NEXT: v_readfirstlane_b32 s10, v5
@@ -8863,7 +8864,6 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s75, v2
; VI-NEXT: v_readfirstlane_b32 s78, v1
; VI-NEXT: v_readfirstlane_b32 s89, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 23
; VI-NEXT: s_lshr_b32 s13, s29, 16
; VI-NEXT: s_lshr_b32 s72, s28, 16
; VI-NEXT: s_lshr_b32 s74, s27, 16
@@ -9131,6 +9131,8 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s57, v7
; GFX9-NEXT: v_readfirstlane_b32 s56, v6
; GFX9-NEXT: v_readfirstlane_b32 s60, v5
@@ -9139,7 +9141,6 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s76, v1
; GFX9-NEXT: v_readfirstlane_b32 s78, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -9163,7 +9164,6 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s77, s76, 16
; GFX9-NEXT: s_lshr_b32 s79, s78, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v8
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -9554,8 +9554,8 @@ define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -9677,8 +9677,8 @@ define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -9800,8 +9800,8 @@ define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -9913,42 +9913,42 @@ define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB21_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -10630,8 +10630,8 @@ define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -10753,8 +10753,8 @@ define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -10876,8 +10876,8 @@ define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -10989,42 +10989,42 @@ define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB25_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -11240,8 +11240,8 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -11352,8 +11352,8 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -11464,8 +11464,8 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -11566,42 +11566,42 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB27_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -13800,7 +13800,6 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v44i16_to_v22f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -13817,6 +13816,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: v_mov_b32_e32 v33, v20
; GFX9-NEXT: v_mov_b32_e32 v45, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -14222,6 +14222,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; SI-NEXT: v_writelane_b32 v22, s66, 16
; SI-NEXT: v_writelane_b32 v22, s67, 17
; SI-NEXT: v_writelane_b32 v22, s30, 18
+; SI-NEXT: v_writelane_b32 v22, s31, 19
; SI-NEXT: v_readfirstlane_b32 s7, v7
; SI-NEXT: v_readfirstlane_b32 s9, v6
; SI-NEXT: v_readfirstlane_b32 s11, v5
@@ -14230,7 +14231,6 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s75, v2
; SI-NEXT: v_readfirstlane_b32 s79, v1
; SI-NEXT: v_readfirstlane_b32 s90, v0
-; SI-NEXT: v_writelane_b32 v22, s31, 19
; SI-NEXT: s_lshr_b32 s14, s29, 16
; SI-NEXT: s_lshr_b32 s73, s28, 16
; SI-NEXT: s_lshr_b32 s76, s27, 16
@@ -14516,6 +14516,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; VI-NEXT: v_writelane_b32 v22, s70, 20
; VI-NEXT: v_writelane_b32 v22, s71, 21
; VI-NEXT: v_writelane_b32 v22, s30, 22
+; VI-NEXT: v_writelane_b32 v22, s31, 23
; VI-NEXT: v_readfirstlane_b32 s7, v7
; VI-NEXT: v_readfirstlane_b32 s9, v6
; VI-NEXT: v_readfirstlane_b32 s11, v5
@@ -14524,7 +14525,6 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s75, v2
; VI-NEXT: v_readfirstlane_b32 s79, v1
; VI-NEXT: v_readfirstlane_b32 s90, v0
-; VI-NEXT: v_writelane_b32 v22, s31, 23
; VI-NEXT: s_lshr_b32 s14, s29, 16
; VI-NEXT: s_lshr_b32 s73, s28, 16
; VI-NEXT: s_lshr_b32 s76, s27, 16
@@ -14801,6 +14801,8 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s57, v7
; GFX9-NEXT: v_readfirstlane_b32 s56, v6
; GFX9-NEXT: v_readfirstlane_b32 s60, v5
@@ -14809,7 +14811,6 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s76, v1
; GFX9-NEXT: v_readfirstlane_b32 s78, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -14833,7 +14834,6 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s77, s76, 16
; GFX9-NEXT: s_lshr_b32 s79, s78, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v8
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -17282,7 +17282,6 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v44f16_to_v22f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -17299,6 +17298,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: v_mov_b32_e32 v33, v20
; GFX9-NEXT: v_mov_b32_e32 v45, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -17705,6 +17705,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s66, 16
; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_writelane_b32 v32, s30, 18
+; SI-NEXT: v_writelane_b32 v32, s31, 19
; SI-NEXT: v_readfirstlane_b32 s6, v7
; SI-NEXT: v_readfirstlane_b32 s8, v6
; SI-NEXT: v_readfirstlane_b32 s10, v5
@@ -17713,7 +17714,6 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s72, v2
; SI-NEXT: v_readfirstlane_b32 s74, v1
; SI-NEXT: v_readfirstlane_b32 s77, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 19
; SI-NEXT: s_lshr_b32 s75, s29, 16
; SI-NEXT: s_lshr_b32 s78, s28, 16
; SI-NEXT: s_lshr_b32 s88, s27, 16
@@ -18077,6 +18077,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s70, 20
; VI-NEXT: v_writelane_b32 v32, s71, 21
; VI-NEXT: v_writelane_b32 v32, s30, 22
+; VI-NEXT: v_writelane_b32 v32, s31, 23
; VI-NEXT: v_readfirstlane_b32 s6, v7
; VI-NEXT: v_readfirstlane_b32 s8, v6
; VI-NEXT: v_readfirstlane_b32 s10, v5
@@ -18085,7 +18086,6 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s75, v2
; VI-NEXT: v_readfirstlane_b32 s78, v1
; VI-NEXT: v_readfirstlane_b32 s89, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 23
; VI-NEXT: s_lshr_b32 s13, s29, 16
; VI-NEXT: s_lshr_b32 s72, s28, 16
; VI-NEXT: s_lshr_b32 s74, s27, 16
@@ -18353,6 +18353,8 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s57, v7
; GFX9-NEXT: v_readfirstlane_b32 s56, v6
; GFX9-NEXT: v_readfirstlane_b32 s60, v5
@@ -18361,7 +18363,6 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s76, v1
; GFX9-NEXT: v_readfirstlane_b32 s78, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -18385,7 +18386,6 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX9-NEXT: s_lshr_b32 s77, s76, 16
; GFX9-NEXT: s_lshr_b32 s79, s78, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v8
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -19175,8 +19175,8 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -19287,8 +19287,8 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v8
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -19399,8 +19399,8 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v8
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -19501,42 +19501,42 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB39_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -21508,7 +21508,6 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v44i16_to_v11i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -21525,6 +21524,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: v_mov_b32_e32 v33, v20
; GFX9-NEXT: v_mov_b32_e32 v45, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -21930,6 +21930,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v22, s66, 16
; SI-NEXT: v_writelane_b32 v22, s67, 17
; SI-NEXT: v_writelane_b32 v22, s30, 18
+; SI-NEXT: v_writelane_b32 v22, s31, 19
; SI-NEXT: v_readfirstlane_b32 s7, v7
; SI-NEXT: v_readfirstlane_b32 s9, v6
; SI-NEXT: v_readfirstlane_b32 s11, v5
@@ -21938,7 +21939,6 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s75, v2
; SI-NEXT: v_readfirstlane_b32 s79, v1
; SI-NEXT: v_readfirstlane_b32 s90, v0
-; SI-NEXT: v_writelane_b32 v22, s31, 19
; SI-NEXT: s_lshr_b32 s14, s29, 16
; SI-NEXT: s_lshr_b32 s73, s28, 16
; SI-NEXT: s_lshr_b32 s76, s27, 16
@@ -22224,6 +22224,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v22, s70, 20
; VI-NEXT: v_writelane_b32 v22, s71, 21
; VI-NEXT: v_writelane_b32 v22, s30, 22
+; VI-NEXT: v_writelane_b32 v22, s31, 23
; VI-NEXT: v_readfirstlane_b32 s7, v7
; VI-NEXT: v_readfirstlane_b32 s9, v6
; VI-NEXT: v_readfirstlane_b32 s11, v5
@@ -22232,7 +22233,6 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s75, v2
; VI-NEXT: v_readfirstlane_b32 s79, v1
; VI-NEXT: v_readfirstlane_b32 s90, v0
-; VI-NEXT: v_writelane_b32 v22, s31, 23
; VI-NEXT: s_lshr_b32 s14, s29, 16
; VI-NEXT: s_lshr_b32 s73, s28, 16
; VI-NEXT: s_lshr_b32 s76, s27, 16
@@ -22509,6 +22509,8 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s57, v7
; GFX9-NEXT: v_readfirstlane_b32 s56, v6
; GFX9-NEXT: v_readfirstlane_b32 s60, v5
@@ -22517,7 +22519,6 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s76, v1
; GFX9-NEXT: v_readfirstlane_b32 s78, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -22541,7 +22542,6 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s77, s76, 16
; GFX9-NEXT: s_lshr_b32 s79, s78, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v8
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -24763,7 +24763,6 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v44f16_to_v11i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -24780,6 +24779,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: v_mov_b32_e32 v33, v20
; GFX9-NEXT: v_mov_b32_e32 v45, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -25186,6 +25186,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s66, 16
; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_writelane_b32 v32, s30, 18
+; SI-NEXT: v_writelane_b32 v32, s31, 19
; SI-NEXT: v_readfirstlane_b32 s6, v7
; SI-NEXT: v_readfirstlane_b32 s8, v6
; SI-NEXT: v_readfirstlane_b32 s10, v5
@@ -25194,7 +25195,6 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s72, v2
; SI-NEXT: v_readfirstlane_b32 s74, v1
; SI-NEXT: v_readfirstlane_b32 s77, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 19
; SI-NEXT: s_lshr_b32 s75, s29, 16
; SI-NEXT: s_lshr_b32 s78, s28, 16
; SI-NEXT: s_lshr_b32 s88, s27, 16
@@ -25558,6 +25558,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s70, 20
; VI-NEXT: v_writelane_b32 v32, s71, 21
; VI-NEXT: v_writelane_b32 v32, s30, 22
+; VI-NEXT: v_writelane_b32 v32, s31, 23
; VI-NEXT: v_readfirstlane_b32 s6, v7
; VI-NEXT: v_readfirstlane_b32 s8, v6
; VI-NEXT: v_readfirstlane_b32 s10, v5
@@ -25566,7 +25567,6 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s75, v2
; VI-NEXT: v_readfirstlane_b32 s78, v1
; VI-NEXT: v_readfirstlane_b32 s89, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 23
; VI-NEXT: s_lshr_b32 s13, s29, 16
; VI-NEXT: s_lshr_b32 s72, s28, 16
; VI-NEXT: s_lshr_b32 s74, s27, 16
@@ -25834,6 +25834,8 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s57, v7
; GFX9-NEXT: v_readfirstlane_b32 s56, v6
; GFX9-NEXT: v_readfirstlane_b32 s60, v5
@@ -25842,7 +25844,6 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s76, v1
; GFX9-NEXT: v_readfirstlane_b32 s78, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -25866,7 +25867,6 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s77, s76, 16
; GFX9-NEXT: s_lshr_b32 s79, s78, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v8
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -28130,7 +28130,6 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v44i16_to_v11f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -28147,6 +28146,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: v_mov_b32_e32 v33, v20
; GFX9-NEXT: v_mov_b32_e32 v45, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -28552,6 +28552,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; SI-NEXT: v_writelane_b32 v22, s66, 16
; SI-NEXT: v_writelane_b32 v22, s67, 17
; SI-NEXT: v_writelane_b32 v22, s30, 18
+; SI-NEXT: v_writelane_b32 v22, s31, 19
; SI-NEXT: v_readfirstlane_b32 s7, v7
; SI-NEXT: v_readfirstlane_b32 s9, v6
; SI-NEXT: v_readfirstlane_b32 s11, v5
@@ -28560,7 +28561,6 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s75, v2
; SI-NEXT: v_readfirstlane_b32 s79, v1
; SI-NEXT: v_readfirstlane_b32 s90, v0
-; SI-NEXT: v_writelane_b32 v22, s31, 19
; SI-NEXT: s_lshr_b32 s14, s29, 16
; SI-NEXT: s_lshr_b32 s73, s28, 16
; SI-NEXT: s_lshr_b32 s76, s27, 16
@@ -28846,6 +28846,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; VI-NEXT: v_writelane_b32 v22, s70, 20
; VI-NEXT: v_writelane_b32 v22, s71, 21
; VI-NEXT: v_writelane_b32 v22, s30, 22
+; VI-NEXT: v_writelane_b32 v22, s31, 23
; VI-NEXT: v_readfirstlane_b32 s7, v7
; VI-NEXT: v_readfirstlane_b32 s9, v6
; VI-NEXT: v_readfirstlane_b32 s11, v5
@@ -28854,7 +28855,6 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s75, v2
; VI-NEXT: v_readfirstlane_b32 s79, v1
; VI-NEXT: v_readfirstlane_b32 s90, v0
-; VI-NEXT: v_writelane_b32 v22, s31, 23
; VI-NEXT: s_lshr_b32 s14, s29, 16
; VI-NEXT: s_lshr_b32 s73, s28, 16
; VI-NEXT: s_lshr_b32 s76, s27, 16
@@ -29131,6 +29131,8 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s57, v7
; GFX9-NEXT: v_readfirstlane_b32 s56, v6
; GFX9-NEXT: v_readfirstlane_b32 s60, v5
@@ -29139,7 +29141,6 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s76, v1
; GFX9-NEXT: v_readfirstlane_b32 s78, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -29163,7 +29164,6 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s77, s76, 16
; GFX9-NEXT: s_lshr_b32 s79, s78, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v8
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -31524,7 +31524,6 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v44f16_to_v11f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -31541,6 +31540,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v32, v21
; GFX9-NEXT: v_mov_b32_e32 v33, v20
; GFX9-NEXT: v_mov_b32_e32 v45, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32
@@ -31947,6 +31947,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; SI-NEXT: v_writelane_b32 v32, s66, 16
; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_writelane_b32 v32, s30, 18
+; SI-NEXT: v_writelane_b32 v32, s31, 19
; SI-NEXT: v_readfirstlane_b32 s6, v7
; SI-NEXT: v_readfirstlane_b32 s8, v6
; SI-NEXT: v_readfirstlane_b32 s10, v5
@@ -31955,7 +31956,6 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; SI-NEXT: v_readfirstlane_b32 s72, v2
; SI-NEXT: v_readfirstlane_b32 s74, v1
; SI-NEXT: v_readfirstlane_b32 s77, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 19
; SI-NEXT: s_lshr_b32 s75, s29, 16
; SI-NEXT: s_lshr_b32 s78, s28, 16
; SI-NEXT: s_lshr_b32 s88, s27, 16
@@ -32319,6 +32319,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; VI-NEXT: v_writelane_b32 v32, s70, 20
; VI-NEXT: v_writelane_b32 v32, s71, 21
; VI-NEXT: v_writelane_b32 v32, s30, 22
+; VI-NEXT: v_writelane_b32 v32, s31, 23
; VI-NEXT: v_readfirstlane_b32 s6, v7
; VI-NEXT: v_readfirstlane_b32 s8, v6
; VI-NEXT: v_readfirstlane_b32 s10, v5
@@ -32327,7 +32328,6 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; VI-NEXT: v_readfirstlane_b32 s75, v2
; VI-NEXT: v_readfirstlane_b32 s78, v1
; VI-NEXT: v_readfirstlane_b32 s89, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 23
; VI-NEXT: s_lshr_b32 s13, s29, 16
; VI-NEXT: s_lshr_b32 s72, s28, 16
; VI-NEXT: s_lshr_b32 s74, s27, 16
@@ -32595,6 +32595,8 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s57, v7
; GFX9-NEXT: v_readfirstlane_b32 s56, v6
; GFX9-NEXT: v_readfirstlane_b32 s60, v5
@@ -32603,7 +32605,6 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s74, v2
; GFX9-NEXT: v_readfirstlane_b32 s76, v1
; GFX9-NEXT: v_readfirstlane_b32 s78, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -32627,7 +32628,6 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX9-NEXT: s_lshr_b32 s77, s76, 16
; GFX9-NEXT: s_lshr_b32 s79, s78, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v8
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
@@ -32856,6 +32856,22 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v44i16_to_v44f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v3
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
@@ -32925,25 +32941,8 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) #0 {
; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v0
; SI-NEXT: ; kill: killed $vgpr22
; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v52
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v50
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v37
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v32
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34
@@ -32956,7 +32955,6 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v26
; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v35
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v33
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v24
@@ -32982,6 +32980,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) #0 {
; SI-NEXT: s_cbranch_execz .LBB56_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51
; SI-NEXT: v_or_b32_e32 v42, v1, v22
; SI-NEXT: v_alignbit_b32 v1, v42, v46, 16
@@ -33141,6 +33140,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) #0 {
; SI-NEXT: s_mov_b32 s6, 0x30000
; SI-NEXT: v_or_b32_e32 v20, v60, v20
; SI-NEXT: v_or_b32_e32 v18, v58, v18
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20
; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19
@@ -33209,7 +33209,6 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) #0 {
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -33871,6 +33870,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v22, s84, 26
; SI-NEXT: v_writelane_b32 v22, s85, 27
; SI-NEXT: v_writelane_b32 v22, s30, 28
+; SI-NEXT: v_writelane_b32 v22, s31, 29
; SI-NEXT: v_readfirstlane_b32 s83, v7
; SI-NEXT: v_readfirstlane_b32 s85, v6
; SI-NEXT: v_readfirstlane_b32 s80, v5
@@ -33879,7 +33879,6 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s71, v2
; SI-NEXT: v_readfirstlane_b32 s66, v1
; SI-NEXT: v_readfirstlane_b32 s68, v0
-; SI-NEXT: v_writelane_b32 v22, s31, 29
; SI-NEXT: s_lshr_b32 s38, s29, 16
; SI-NEXT: s_lshr_b32 s65, s28, 16
; SI-NEXT: s_lshr_b32 s37, s27, 16
@@ -35719,6 +35718,22 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; SI-LABEL: bitcast_v44f16_to_v44i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s6, v7
; SI-NEXT: v_readfirstlane_b32 s7, v6
; SI-NEXT: v_readfirstlane_b32 s42, v5
@@ -35751,22 +35766,6 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s61, s63, 16
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB59_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_cbranch_execnz .LBB59_4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index 1194fa2305563..d0eac628194ee 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -637,8 +637,8 @@ define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -764,8 +764,8 @@ define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -891,8 +891,8 @@ define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -1008,44 +1008,44 @@ define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB3_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -2647,8 +2647,8 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -2762,8 +2762,8 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -2877,8 +2877,8 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -2982,44 +2982,44 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -3776,8 +3776,9 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v24, s34, 0
; SI-NEXT: v_writelane_b32 v24, s35, 1
-; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_writelane_b32 v24, s30, 2
+; SI-NEXT: v_writelane_b32 v24, s31, 3
+; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_readfirstlane_b32 s5, v9
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_readfirstlane_b32 s7, v7
@@ -3789,7 +3790,6 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: s_cmp_lg_u32 s12, 0
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_writelane_b32 v24, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB13_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s88, s5, 16
@@ -5632,6 +5632,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v24, s70, 20
; SI-NEXT: v_writelane_b32 v24, s71, 21
; SI-NEXT: v_writelane_b32 v24, s30, 22
+; SI-NEXT: v_writelane_b32 v24, s31, 23
; SI-NEXT: v_readfirstlane_b32 s7, v9
; SI-NEXT: v_readfirstlane_b32 s9, v8
; SI-NEXT: v_readfirstlane_b32 s11, v7
@@ -5642,7 +5643,6 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s88, v2
; SI-NEXT: v_readfirstlane_b32 s91, v1
; SI-NEXT: v_readfirstlane_b32 s94, v0
-; SI-NEXT: v_writelane_b32 v24, s31, 23
; SI-NEXT: s_lshr_b32 s72, s29, 16
; SI-NEXT: s_lshr_b32 s75, s28, 16
; SI-NEXT: s_lshr_b32 s78, s27, 16
@@ -5956,6 +5956,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v24, s82, 24
; VI-NEXT: v_writelane_b32 v24, s83, 25
; VI-NEXT: v_writelane_b32 v24, s30, 26
+; VI-NEXT: v_writelane_b32 v24, s31, 27
; VI-NEXT: v_readfirstlane_b32 s7, v9
; VI-NEXT: v_readfirstlane_b32 s9, v8
; VI-NEXT: v_readfirstlane_b32 s11, v7
@@ -5966,7 +5967,6 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s88, v2
; VI-NEXT: v_readfirstlane_b32 s91, v1
; VI-NEXT: v_readfirstlane_b32 s34, v0
-; VI-NEXT: v_writelane_b32 v24, s31, 27
; VI-NEXT: s_lshr_b32 s72, s29, 16
; VI-NEXT: s_lshr_b32 s75, s28, 16
; VI-NEXT: s_lshr_b32 s78, s27, 16
@@ -6266,8 +6266,10 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s59, v9
; GFX9-NEXT: v_readfirstlane_b32 s58, v8
@@ -6280,7 +6282,6 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s88, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -6305,7 +6306,6 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
; GFX9-NEXT: s_lshr_b32 s17, s16, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s38, s18, s15
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
@@ -7223,8 +7223,9 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v24, s34, 0
; SI-NEXT: v_writelane_b32 v24, s35, 1
-; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_writelane_b32 v24, s30, 2
+; SI-NEXT: v_writelane_b32 v24, s31, 3
+; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_readfirstlane_b32 s5, v9
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_readfirstlane_b32 s7, v7
@@ -7236,7 +7237,6 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: s_cmp_lg_u32 s12, 0
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_writelane_b32 v24, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB17_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s88, s5, 16
@@ -9191,6 +9191,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s70, 20
; SI-NEXT: v_writelane_b32 v32, s71, 21
; SI-NEXT: v_writelane_b32 v32, s30, 22
+; SI-NEXT: v_writelane_b32 v32, s31, 23
; SI-NEXT: v_readfirstlane_b32 s6, v9
; SI-NEXT: v_readfirstlane_b32 s8, v8
; SI-NEXT: v_readfirstlane_b32 s10, v7
@@ -9201,7 +9202,6 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s76, v2
; SI-NEXT: v_readfirstlane_b32 s79, v1
; SI-NEXT: v_readfirstlane_b32 s89, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 23
; SI-NEXT: s_lshr_b32 s78, s29, 16
; SI-NEXT: s_lshr_b32 s90, s28, 16
; SI-NEXT: s_lshr_b32 s92, s27, 16
@@ -9597,6 +9597,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s82, 24
; VI-NEXT: v_writelane_b32 v32, s83, 25
; VI-NEXT: v_writelane_b32 v32, s30, 26
+; VI-NEXT: v_writelane_b32 v32, s31, 27
; VI-NEXT: v_readfirstlane_b32 s6, v9
; VI-NEXT: v_readfirstlane_b32 s8, v8
; VI-NEXT: v_readfirstlane_b32 s10, v7
@@ -9607,7 +9608,6 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s79, v2
; VI-NEXT: v_readfirstlane_b32 s91, v1
; VI-NEXT: v_readfirstlane_b32 s34, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 27
; VI-NEXT: s_lshr_b32 s72, s29, 16
; VI-NEXT: s_lshr_b32 s74, s28, 16
; VI-NEXT: s_lshr_b32 s77, s27, 16
@@ -9894,8 +9894,10 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s59, v9
; GFX9-NEXT: v_readfirstlane_b32 s58, v8
@@ -9908,7 +9910,6 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s88, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -9933,7 +9934,6 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
; GFX9-NEXT: s_lshr_b32 s17, s16, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s38, s18, s15
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
@@ -10341,8 +10341,8 @@ define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -10468,8 +10468,8 @@ define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -10595,8 +10595,8 @@ define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -10712,44 +10712,44 @@ define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB21_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -11471,8 +11471,8 @@ define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -11598,8 +11598,8 @@ define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -11725,8 +11725,8 @@ define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -11842,44 +11842,44 @@ define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB25_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -12101,8 +12101,8 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -12216,8 +12216,8 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -12331,8 +12331,8 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -12436,44 +12436,44 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB27_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -13206,8 +13206,9 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v40, s34, 0
; SI-NEXT: v_writelane_b32 v40, s35, 1
-; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_writelane_b32 v40, s30, 2
+; SI-NEXT: v_writelane_b32 v40, s31, 3
+; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_readfirstlane_b32 s5, v9
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_readfirstlane_b32 s7, v7
@@ -13219,7 +13220,6 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: s_cmp_lg_u32 s12, 0
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_writelane_b32 v40, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB29_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s35, s5, 16
@@ -15346,6 +15346,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; SI-NEXT: v_writelane_b32 v24, s70, 20
; SI-NEXT: v_writelane_b32 v24, s71, 21
; SI-NEXT: v_writelane_b32 v24, s30, 22
+; SI-NEXT: v_writelane_b32 v24, s31, 23
; SI-NEXT: v_readfirstlane_b32 s7, v9
; SI-NEXT: v_readfirstlane_b32 s9, v8
; SI-NEXT: v_readfirstlane_b32 s11, v7
@@ -15356,7 +15357,6 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s88, v2
; SI-NEXT: v_readfirstlane_b32 s91, v1
; SI-NEXT: v_readfirstlane_b32 s94, v0
-; SI-NEXT: v_writelane_b32 v24, s31, 23
; SI-NEXT: s_lshr_b32 s72, s29, 16
; SI-NEXT: s_lshr_b32 s75, s28, 16
; SI-NEXT: s_lshr_b32 s78, s27, 16
@@ -15670,6 +15670,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; VI-NEXT: v_writelane_b32 v24, s82, 24
; VI-NEXT: v_writelane_b32 v24, s83, 25
; VI-NEXT: v_writelane_b32 v24, s30, 26
+; VI-NEXT: v_writelane_b32 v24, s31, 27
; VI-NEXT: v_readfirstlane_b32 s7, v9
; VI-NEXT: v_readfirstlane_b32 s9, v8
; VI-NEXT: v_readfirstlane_b32 s11, v7
@@ -15680,7 +15681,6 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s88, v2
; VI-NEXT: v_readfirstlane_b32 s91, v1
; VI-NEXT: v_readfirstlane_b32 s34, v0
-; VI-NEXT: v_writelane_b32 v24, s31, 27
; VI-NEXT: s_lshr_b32 s72, s29, 16
; VI-NEXT: s_lshr_b32 s75, s28, 16
; VI-NEXT: s_lshr_b32 s78, s27, 16
@@ -15980,8 +15980,10 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s59, v9
; GFX9-NEXT: v_readfirstlane_b32 s58, v8
@@ -15994,7 +15996,6 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s88, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -16019,7 +16020,6 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
; GFX9-NEXT: s_lshr_b32 s17, s16, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s38, s18, s15
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
@@ -16913,8 +16913,9 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v40, s34, 0
; SI-NEXT: v_writelane_b32 v40, s35, 1
-; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_writelane_b32 v40, s30, 2
+; SI-NEXT: v_writelane_b32 v40, s31, 3
+; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_readfirstlane_b32 s5, v9
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_readfirstlane_b32 s7, v7
@@ -16926,7 +16927,6 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: s_cmp_lg_u32 s12, 0
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_writelane_b32 v40, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB33_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s35, s5, 16
@@ -19165,6 +19165,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s70, 20
; SI-NEXT: v_writelane_b32 v32, s71, 21
; SI-NEXT: v_writelane_b32 v32, s30, 22
+; SI-NEXT: v_writelane_b32 v32, s31, 23
; SI-NEXT: v_readfirstlane_b32 s6, v9
; SI-NEXT: v_readfirstlane_b32 s8, v8
; SI-NEXT: v_readfirstlane_b32 s10, v7
@@ -19175,7 +19176,6 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s76, v2
; SI-NEXT: v_readfirstlane_b32 s79, v1
; SI-NEXT: v_readfirstlane_b32 s89, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 23
; SI-NEXT: s_lshr_b32 s78, s29, 16
; SI-NEXT: s_lshr_b32 s90, s28, 16
; SI-NEXT: s_lshr_b32 s92, s27, 16
@@ -19571,6 +19571,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s82, 24
; VI-NEXT: v_writelane_b32 v32, s83, 25
; VI-NEXT: v_writelane_b32 v32, s30, 26
+; VI-NEXT: v_writelane_b32 v32, s31, 27
; VI-NEXT: v_readfirstlane_b32 s6, v9
; VI-NEXT: v_readfirstlane_b32 s8, v8
; VI-NEXT: v_readfirstlane_b32 s10, v7
@@ -19581,7 +19582,6 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s79, v2
; VI-NEXT: v_readfirstlane_b32 s91, v1
; VI-NEXT: v_readfirstlane_b32 s34, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 27
; VI-NEXT: s_lshr_b32 s72, s29, 16
; VI-NEXT: s_lshr_b32 s74, s28, 16
; VI-NEXT: s_lshr_b32 s77, s27, 16
@@ -19868,8 +19868,10 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s59, v9
; GFX9-NEXT: v_readfirstlane_b32 s58, v8
@@ -19882,7 +19884,6 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s88, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -19907,7 +19908,6 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
; GFX9-NEXT: s_lshr_b32 s17, s16, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s38, s18, s15
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
@@ -20742,8 +20742,8 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -20857,8 +20857,8 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v10
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -20972,8 +20972,8 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v10
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -21077,44 +21077,44 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB39_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -21883,8 +21883,9 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v24, s34, 0
; SI-NEXT: v_writelane_b32 v24, s35, 1
-; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_writelane_b32 v24, s30, 2
+; SI-NEXT: v_writelane_b32 v24, s31, 3
+; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_readfirstlane_b32 s5, v9
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_readfirstlane_b32 s7, v7
@@ -21896,7 +21897,6 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: s_cmp_lg_u32 s12, 0
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_writelane_b32 v24, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB41_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s88, s5, 16
@@ -23739,6 +23739,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v24, s70, 20
; SI-NEXT: v_writelane_b32 v24, s71, 21
; SI-NEXT: v_writelane_b32 v24, s30, 22
+; SI-NEXT: v_writelane_b32 v24, s31, 23
; SI-NEXT: v_readfirstlane_b32 s7, v9
; SI-NEXT: v_readfirstlane_b32 s9, v8
; SI-NEXT: v_readfirstlane_b32 s11, v7
@@ -23749,7 +23750,6 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s88, v2
; SI-NEXT: v_readfirstlane_b32 s91, v1
; SI-NEXT: v_readfirstlane_b32 s94, v0
-; SI-NEXT: v_writelane_b32 v24, s31, 23
; SI-NEXT: s_lshr_b32 s72, s29, 16
; SI-NEXT: s_lshr_b32 s75, s28, 16
; SI-NEXT: s_lshr_b32 s78, s27, 16
@@ -24063,6 +24063,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v24, s82, 24
; VI-NEXT: v_writelane_b32 v24, s83, 25
; VI-NEXT: v_writelane_b32 v24, s30, 26
+; VI-NEXT: v_writelane_b32 v24, s31, 27
; VI-NEXT: v_readfirstlane_b32 s7, v9
; VI-NEXT: v_readfirstlane_b32 s9, v8
; VI-NEXT: v_readfirstlane_b32 s11, v7
@@ -24073,7 +24074,6 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s88, v2
; VI-NEXT: v_readfirstlane_b32 s91, v1
; VI-NEXT: v_readfirstlane_b32 s34, v0
-; VI-NEXT: v_writelane_b32 v24, s31, 27
; VI-NEXT: s_lshr_b32 s72, s29, 16
; VI-NEXT: s_lshr_b32 s75, s28, 16
; VI-NEXT: s_lshr_b32 s78, s27, 16
@@ -24373,8 +24373,10 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s59, v9
; GFX9-NEXT: v_readfirstlane_b32 s58, v8
@@ -24387,7 +24389,6 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s88, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -24412,7 +24413,6 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
; GFX9-NEXT: s_lshr_b32 s17, s16, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s38, s18, s15
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
@@ -25342,8 +25342,9 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v24, s34, 0
; SI-NEXT: v_writelane_b32 v24, s35, 1
-; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_writelane_b32 v24, s30, 2
+; SI-NEXT: v_writelane_b32 v24, s31, 3
+; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_readfirstlane_b32 s5, v9
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: v_readfirstlane_b32 s7, v7
@@ -25355,7 +25356,6 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: s_cmp_lg_u32 s12, 0
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_writelane_b32 v24, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB45_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s88, s5, 16
@@ -27310,6 +27310,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s70, 20
; SI-NEXT: v_writelane_b32 v32, s71, 21
; SI-NEXT: v_writelane_b32 v32, s30, 22
+; SI-NEXT: v_writelane_b32 v32, s31, 23
; SI-NEXT: v_readfirstlane_b32 s6, v9
; SI-NEXT: v_readfirstlane_b32 s8, v8
; SI-NEXT: v_readfirstlane_b32 s10, v7
@@ -27320,7 +27321,6 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s76, v2
; SI-NEXT: v_readfirstlane_b32 s79, v1
; SI-NEXT: v_readfirstlane_b32 s89, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 23
; SI-NEXT: s_lshr_b32 s78, s29, 16
; SI-NEXT: s_lshr_b32 s90, s28, 16
; SI-NEXT: s_lshr_b32 s92, s27, 16
@@ -27716,6 +27716,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s82, 24
; VI-NEXT: v_writelane_b32 v32, s83, 25
; VI-NEXT: v_writelane_b32 v32, s30, 26
+; VI-NEXT: v_writelane_b32 v32, s31, 27
; VI-NEXT: v_readfirstlane_b32 s6, v9
; VI-NEXT: v_readfirstlane_b32 s8, v8
; VI-NEXT: v_readfirstlane_b32 s10, v7
@@ -27726,7 +27727,6 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s79, v2
; VI-NEXT: v_readfirstlane_b32 s91, v1
; VI-NEXT: v_readfirstlane_b32 s34, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 27
; VI-NEXT: s_lshr_b32 s72, s29, 16
; VI-NEXT: s_lshr_b32 s74, s28, 16
; VI-NEXT: s_lshr_b32 s77, s27, 16
@@ -28013,8 +28013,10 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s59, v9
; GFX9-NEXT: v_readfirstlane_b32 s58, v8
@@ -28027,7 +28029,6 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s88, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -28052,7 +28053,6 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
; GFX9-NEXT: s_lshr_b32 s17, s16, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s38, s18, s15
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
@@ -28911,8 +28911,9 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v40, s34, 0
; SI-NEXT: v_writelane_b32 v40, s35, 1
-; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_writelane_b32 v40, s30, 2
+; SI-NEXT: v_writelane_b32 v40, s31, 3
+; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_readfirstlane_b32 s13, v9
; SI-NEXT: v_readfirstlane_b32 s12, v8
; SI-NEXT: v_readfirstlane_b32 s11, v7
@@ -28924,7 +28925,6 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s5, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: v_writelane_b32 v40, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB49_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s35, s13, 16
@@ -30991,6 +30991,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; SI-NEXT: v_writelane_b32 v24, s70, 20
; SI-NEXT: v_writelane_b32 v24, s71, 21
; SI-NEXT: v_writelane_b32 v24, s30, 22
+; SI-NEXT: v_writelane_b32 v24, s31, 23
; SI-NEXT: v_readfirstlane_b32 s7, v9
; SI-NEXT: v_readfirstlane_b32 s9, v8
; SI-NEXT: v_readfirstlane_b32 s11, v7
@@ -31001,7 +31002,6 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s88, v2
; SI-NEXT: v_readfirstlane_b32 s91, v1
; SI-NEXT: v_readfirstlane_b32 s94, v0
-; SI-NEXT: v_writelane_b32 v24, s31, 23
; SI-NEXT: s_lshr_b32 s72, s29, 16
; SI-NEXT: s_lshr_b32 s75, s28, 16
; SI-NEXT: s_lshr_b32 s78, s27, 16
@@ -31315,6 +31315,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; VI-NEXT: v_writelane_b32 v24, s82, 24
; VI-NEXT: v_writelane_b32 v24, s83, 25
; VI-NEXT: v_writelane_b32 v24, s30, 26
+; VI-NEXT: v_writelane_b32 v24, s31, 27
; VI-NEXT: v_readfirstlane_b32 s7, v9
; VI-NEXT: v_readfirstlane_b32 s9, v8
; VI-NEXT: v_readfirstlane_b32 s11, v7
@@ -31325,7 +31326,6 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s88, v2
; VI-NEXT: v_readfirstlane_b32 s91, v1
; VI-NEXT: v_readfirstlane_b32 s34, v0
-; VI-NEXT: v_writelane_b32 v24, s31, 27
; VI-NEXT: s_lshr_b32 s72, s29, 16
; VI-NEXT: s_lshr_b32 s75, s28, 16
; VI-NEXT: s_lshr_b32 s78, s27, 16
@@ -31625,8 +31625,10 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s59, v9
; GFX9-NEXT: v_readfirstlane_b32 s58, v8
@@ -31639,7 +31641,6 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s88, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -31664,7 +31665,6 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
; GFX9-NEXT: s_lshr_b32 s17, s16, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s38, s18, s15
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
@@ -32522,8 +32522,9 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v40, s34, 0
; SI-NEXT: v_writelane_b32 v40, s35, 1
-; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_writelane_b32 v40, s30, 2
+; SI-NEXT: v_writelane_b32 v40, s31, 3
+; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_readfirstlane_b32 s13, v9
; SI-NEXT: v_readfirstlane_b32 s12, v8
; SI-NEXT: v_readfirstlane_b32 s11, v7
@@ -32535,7 +32536,6 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
; SI-NEXT: v_readfirstlane_b32 s5, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: v_writelane_b32 v40, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB53_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s35, s13, 16
@@ -34714,6 +34714,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; SI-NEXT: v_writelane_b32 v32, s70, 20
; SI-NEXT: v_writelane_b32 v32, s71, 21
; SI-NEXT: v_writelane_b32 v32, s30, 22
+; SI-NEXT: v_writelane_b32 v32, s31, 23
; SI-NEXT: v_readfirstlane_b32 s6, v9
; SI-NEXT: v_readfirstlane_b32 s8, v8
; SI-NEXT: v_readfirstlane_b32 s10, v7
@@ -34724,7 +34725,6 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; SI-NEXT: v_readfirstlane_b32 s76, v2
; SI-NEXT: v_readfirstlane_b32 s79, v1
; SI-NEXT: v_readfirstlane_b32 s89, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 23
; SI-NEXT: s_lshr_b32 s78, s29, 16
; SI-NEXT: s_lshr_b32 s90, s28, 16
; SI-NEXT: s_lshr_b32 s92, s27, 16
@@ -35120,6 +35120,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; VI-NEXT: v_writelane_b32 v32, s82, 24
; VI-NEXT: v_writelane_b32 v32, s83, 25
; VI-NEXT: v_writelane_b32 v32, s30, 26
+; VI-NEXT: v_writelane_b32 v32, s31, 27
; VI-NEXT: v_readfirstlane_b32 s6, v9
; VI-NEXT: v_readfirstlane_b32 s8, v8
; VI-NEXT: v_readfirstlane_b32 s10, v7
@@ -35130,7 +35131,6 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; VI-NEXT: v_readfirstlane_b32 s79, v2
; VI-NEXT: v_readfirstlane_b32 s91, v1
; VI-NEXT: v_readfirstlane_b32 s34, v0
-; VI-NEXT: v_writelane_b32 v32, s31, 27
; VI-NEXT: s_lshr_b32 s72, s29, 16
; VI-NEXT: s_lshr_b32 s74, s28, 16
; VI-NEXT: s_lshr_b32 s77, s27, 16
@@ -35417,8 +35417,10 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
-; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: s_lshr_b32 s41, s16, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s59, v9
; GFX9-NEXT: v_readfirstlane_b32 s58, v8
@@ -35431,7 +35433,6 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s88, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s36, s16, s41
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -35456,7 +35457,6 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX9-NEXT: s_pack_ll_b32_b16 s37, s17, s40
; GFX9-NEXT: s_lshr_b32 s17, s16, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s38, s18, s15
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
@@ -35695,6 +35695,22 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48i16_to_v48f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
@@ -35762,22 +35778,6 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) #0 {
; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: ; kill: killed $vgpr24
; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v21
@@ -35798,22 +35798,17 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) #0 {
; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0
; SI-NEXT: ; kill: killed $vgpr24
; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v54
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v52
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v51
; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36
; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v49
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v48
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v30
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v38
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v35
; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v28
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v33
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25
@@ -35843,6 +35838,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) #0 {
; SI-NEXT: s_cbranch_execz .LBB56_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53
; SI-NEXT: v_or_b32_e32 v46, v1, v24
; SI-NEXT: v_alignbit_b32 v1, v46, v58, 16
@@ -36023,6 +36019,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) #0 {
; SI-NEXT: s_mov_b32 s6, 0x30000
; SI-NEXT: v_or_b32_e32 v22, v26, v22
; SI-NEXT: v_or_b32_e32 v20, v42, v20
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v22
; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20
; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v21
@@ -36828,6 +36825,8 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v24, s97, 31
; SI-NEXT: v_writelane_b32 v24, s98, 32
; SI-NEXT: v_writelane_b32 v24, s99, 33
+; SI-NEXT: v_writelane_b32 v24, s30, 34
+; SI-NEXT: v_writelane_b32 v24, s31, 35
; SI-NEXT: v_readfirstlane_b32 s99, v9
; SI-NEXT: v_readfirstlane_b32 s65, v8
; SI-NEXT: v_readfirstlane_b32 s96, v7
@@ -36863,9 +36862,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s51, s71, 16
; SI-NEXT: s_lshr_b32 s80, s81, 16
; SI-NEXT: v_readfirstlane_b32 s4, v10
-; SI-NEXT: v_writelane_b32 v24, s30, 34
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: v_writelane_b32 v24, s31, 35
; SI-NEXT: s_cbranch_scc0 .LBB57_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s5, s17, 0xffff
@@ -38071,6 +38068,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48f16_to_v48i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23
; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22
; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21
@@ -38096,7 +38094,6 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) #0 {
; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
@@ -38847,6 +38844,22 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-LABEL: bitcast_v48f16_to_v48i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s11, v9
; SI-NEXT: v_readfirstlane_b32 s6, v8
; SI-NEXT: v_readfirstlane_b32 s12, v7
@@ -38883,22 +38896,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s74, s10, 16
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB59_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_cbranch_execnz .LBB59_4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index 272038cfc4881..3cf8b7aac0adb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -675,8 +675,8 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -806,8 +806,8 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -937,8 +937,8 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -1058,46 +1058,46 @@ define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v8
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB3_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -2799,8 +2799,8 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -2917,8 +2917,8 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -3035,8 +3035,8 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -3143,46 +3143,46 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v8
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -3260,11 +3260,11 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v52i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr50
@@ -3476,11 +3476,11 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v26i32_to_v52i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr41
@@ -3657,11 +3657,11 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v26i32_to_v52i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr41
@@ -4029,8 +4029,9 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; SI-NEXT: v_writelane_b32 v26, s38, 4
; SI-NEXT: v_writelane_b32 v26, s39, 5
; SI-NEXT: v_writelane_b32 v26, s48, 6
-; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_writelane_b32 v26, s30, 7
+; SI-NEXT: v_writelane_b32 v26, s31, 8
+; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_readfirstlane_b32 s5, v11
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_readfirstlane_b32 s7, v9
@@ -4044,7 +4045,6 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s15, v1
; SI-NEXT: s_cmp_lg_u32 s14, 0
; SI-NEXT: v_readfirstlane_b32 s14, v0
-; SI-NEXT: v_writelane_b32 v26, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB13_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s92, s5, 16
@@ -6749,10 +6749,12 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
; GFX9-NEXT: v_readfirstlane_b32 s60, v10
@@ -6769,7 +6771,6 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s15, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
; GFX9-NEXT: v_readfirstlane_b32 s14, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -6794,7 +6795,6 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s18, s15, 16
; GFX9-NEXT: s_lshr_b32 s19, s14, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v12
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s40, s20, s13
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
@@ -7042,11 +7042,11 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v52f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr50
@@ -7258,11 +7258,11 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v26i32_to_v52f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr41
@@ -7439,11 +7439,11 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v26i32_to_v52f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr41
@@ -7811,8 +7811,9 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; SI-NEXT: v_writelane_b32 v26, s38, 4
; SI-NEXT: v_writelane_b32 v26, s39, 5
; SI-NEXT: v_writelane_b32 v26, s48, 6
-; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_writelane_b32 v26, s30, 7
+; SI-NEXT: v_writelane_b32 v26, s31, 8
+; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_readfirstlane_b32 s5, v11
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_readfirstlane_b32 s7, v9
@@ -7826,7 +7827,6 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s15, v1
; SI-NEXT: s_cmp_lg_u32 s14, 0
; SI-NEXT: v_readfirstlane_b32 s14, v0
-; SI-NEXT: v_writelane_b32 v26, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB17_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s92, s5, 16
@@ -9962,6 +9962,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s82, 24
; SI-NEXT: v_writelane_b32 v32, s83, 25
; SI-NEXT: v_writelane_b32 v32, s30, 26
+; SI-NEXT: v_writelane_b32 v32, s31, 27
; SI-NEXT: v_readfirstlane_b32 s6, v11
; SI-NEXT: v_readfirstlane_b32 s8, v10
; SI-NEXT: v_readfirstlane_b32 s10, v9
@@ -9974,7 +9975,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s88, v2
; SI-NEXT: v_readfirstlane_b32 s91, v1
; SI-NEXT: v_readfirstlane_b32 s94, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 27
; SI-NEXT: s_lshr_b32 s90, s29, 16
; SI-NEXT: s_lshr_b32 s93, s28, 16
; SI-NEXT: s_lshr_b32 s30, s27, 16
@@ -10721,10 +10721,12 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
; GFX9-NEXT: v_readfirstlane_b32 s60, v10
@@ -10741,7 +10743,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s15, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
; GFX9-NEXT: v_readfirstlane_b32 s14, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -10766,7 +10767,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s18, s15, 16
; GFX9-NEXT: s_lshr_b32 s19, s14, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v12
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s40, s20, s13
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
@@ -11191,8 +11191,8 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -11322,8 +11322,8 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -11453,8 +11453,8 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -11574,46 +11574,46 @@ define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v8
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB21_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -12376,8 +12376,8 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -12507,8 +12507,8 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -12638,8 +12638,8 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -12759,46 +12759,46 @@ define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v8
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB25_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -13026,8 +13026,8 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -13144,8 +13144,8 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -13262,8 +13262,8 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -13370,46 +13370,46 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v8
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB27_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -13487,11 +13487,11 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v52i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr50
@@ -13703,11 +13703,11 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v26f32_to_v52i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr41
@@ -13884,11 +13884,11 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v26f32_to_v52i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr41
@@ -14222,7 +14222,12 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_writelane_b32 v45, s34, 0
; SI-NEXT: v_writelane_b32 v45, s35, 1
; SI-NEXT: v_writelane_b32 v45, s36, 2
@@ -14230,8 +14235,9 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v45, s38, 4
; SI-NEXT: v_writelane_b32 v45, s39, 5
; SI-NEXT: v_writelane_b32 v45, s48, 6
-; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_writelane_b32 v45, s30, 7
+; SI-NEXT: v_writelane_b32 v45, s31, 8
+; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_readfirstlane_b32 s5, v11
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_readfirstlane_b32 s7, v9
@@ -14245,12 +14251,6 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s15, v1
; SI-NEXT: s_cmp_lg_u32 s14, 0
; SI-NEXT: v_readfirstlane_b32 s14, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v45, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB29_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s48, s5, 16
@@ -14523,6 +14523,10 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; VI-LABEL: bitcast_v26f32_to_v52i16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: v_readfirstlane_b32 s6, v11
; VI-NEXT: v_readfirstlane_b32 s7, v10
@@ -14537,10 +14541,6 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s40, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s41, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB29_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s42, s6, 16
@@ -14768,6 +14768,10 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; GFX9-LABEL: bitcast_v26f32_to_v52i16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: v_readfirstlane_b32 s6, v11
; GFX9-NEXT: v_readfirstlane_b32 s7, v10
@@ -14782,10 +14786,6 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s40, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s41, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB29_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s42, s6, 16
@@ -17291,10 +17291,12 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
; GFX9-NEXT: v_readfirstlane_b32 s60, v10
@@ -17311,7 +17313,6 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s15, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
; GFX9-NEXT: v_readfirstlane_b32 s14, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -17336,7 +17337,6 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s18, s15, 16
; GFX9-NEXT: s_lshr_b32 s19, s14, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v12
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s40, s20, s13
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
@@ -17584,11 +17584,11 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v52f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr50
@@ -17800,11 +17800,11 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v26f32_to_v52f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr41
@@ -17981,11 +17981,11 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v26f32_to_v52f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr41
@@ -18319,7 +18319,12 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_writelane_b32 v45, s34, 0
; SI-NEXT: v_writelane_b32 v45, s35, 1
; SI-NEXT: v_writelane_b32 v45, s36, 2
@@ -18327,8 +18332,9 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v45, s38, 4
; SI-NEXT: v_writelane_b32 v45, s39, 5
; SI-NEXT: v_writelane_b32 v45, s48, 6
-; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_writelane_b32 v45, s30, 7
+; SI-NEXT: v_writelane_b32 v45, s31, 8
+; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_readfirstlane_b32 s5, v11
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_readfirstlane_b32 s7, v9
@@ -18342,12 +18348,6 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s15, v1
; SI-NEXT: s_cmp_lg_u32 s14, 0
; SI-NEXT: v_readfirstlane_b32 s14, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v45, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB33_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s48, s5, 16
@@ -18620,6 +18620,10 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; VI-LABEL: bitcast_v26f32_to_v52f16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: v_readfirstlane_b32 s6, v11
; VI-NEXT: v_readfirstlane_b32 s7, v10
@@ -18634,10 +18638,6 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s40, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s41, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB33_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s42, s6, 16
@@ -18865,6 +18865,10 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; GFX9-LABEL: bitcast_v26f32_to_v52f16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: v_readfirstlane_b32 s6, v11
; GFX9-NEXT: v_readfirstlane_b32 s7, v10
@@ -18879,10 +18883,6 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s40, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s41, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB33_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s42, s6, 16
@@ -20819,6 +20819,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s82, 24
; SI-NEXT: v_writelane_b32 v32, s83, 25
; SI-NEXT: v_writelane_b32 v32, s30, 26
+; SI-NEXT: v_writelane_b32 v32, s31, 27
; SI-NEXT: v_readfirstlane_b32 s6, v11
; SI-NEXT: v_readfirstlane_b32 s8, v10
; SI-NEXT: v_readfirstlane_b32 s10, v9
@@ -20831,7 +20832,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s88, v2
; SI-NEXT: v_readfirstlane_b32 s91, v1
; SI-NEXT: v_readfirstlane_b32 s94, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 27
; SI-NEXT: s_lshr_b32 s90, s29, 16
; SI-NEXT: s_lshr_b32 s93, s28, 16
; SI-NEXT: s_lshr_b32 s30, s27, 16
@@ -21578,10 +21578,12 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
; GFX9-NEXT: v_readfirstlane_b32 s60, v10
@@ -21598,7 +21600,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s15, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
; GFX9-NEXT: v_readfirstlane_b32 s14, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -21623,7 +21624,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX9-NEXT: s_lshr_b32 s18, s15, 16
; GFX9-NEXT: s_lshr_b32 s19, s14, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v12
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s40, s20, s13
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
@@ -22504,8 +22504,8 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -22622,8 +22622,8 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -22740,8 +22740,8 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -22848,46 +22848,46 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v8
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s39, s3
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB39_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -22965,11 +22965,11 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v52i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr49
@@ -23181,11 +23181,11 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v13i64_to_v52i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr41
@@ -23362,11 +23362,11 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v13i64_to_v52i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr41
@@ -23748,8 +23748,9 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; SI-NEXT: v_writelane_b32 v26, s38, 4
; SI-NEXT: v_writelane_b32 v26, s39, 5
; SI-NEXT: v_writelane_b32 v26, s48, 6
-; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_writelane_b32 v26, s30, 7
+; SI-NEXT: v_writelane_b32 v26, s31, 8
+; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_readfirstlane_b32 s5, v11
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_readfirstlane_b32 s7, v9
@@ -23763,7 +23764,6 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s15, v1
; SI-NEXT: s_cmp_lg_u32 s14, 0
; SI-NEXT: v_readfirstlane_b32 s14, v0
-; SI-NEXT: v_writelane_b32 v26, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB41_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s92, s5, 16
@@ -26468,10 +26468,12 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
; GFX9-NEXT: v_readfirstlane_b32 s60, v10
@@ -26488,7 +26490,6 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s15, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
; GFX9-NEXT: v_readfirstlane_b32 s14, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -26513,7 +26514,6 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s18, s15, 16
; GFX9-NEXT: s_lshr_b32 s19, s14, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v12
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s40, s20, s13
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
@@ -26761,11 +26761,11 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v52f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr49
@@ -26977,11 +26977,11 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v13i64_to_v52f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr41
@@ -27158,11 +27158,11 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v13i64_to_v52f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr41
@@ -27544,8 +27544,9 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; SI-NEXT: v_writelane_b32 v26, s38, 4
; SI-NEXT: v_writelane_b32 v26, s39, 5
; SI-NEXT: v_writelane_b32 v26, s48, 6
-; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_writelane_b32 v26, s30, 7
+; SI-NEXT: v_writelane_b32 v26, s31, 8
+; SI-NEXT: v_readfirstlane_b32 s14, v12
; SI-NEXT: v_readfirstlane_b32 s5, v11
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: v_readfirstlane_b32 s7, v9
@@ -27559,7 +27560,6 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s15, v1
; SI-NEXT: s_cmp_lg_u32 s14, 0
; SI-NEXT: v_readfirstlane_b32 s14, v0
-; SI-NEXT: v_writelane_b32 v26, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB45_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s92, s5, 16
@@ -29695,6 +29695,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s82, 24
; SI-NEXT: v_writelane_b32 v32, s83, 25
; SI-NEXT: v_writelane_b32 v32, s30, 26
+; SI-NEXT: v_writelane_b32 v32, s31, 27
; SI-NEXT: v_readfirstlane_b32 s6, v11
; SI-NEXT: v_readfirstlane_b32 s8, v10
; SI-NEXT: v_readfirstlane_b32 s10, v9
@@ -29707,7 +29708,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s88, v2
; SI-NEXT: v_readfirstlane_b32 s91, v1
; SI-NEXT: v_readfirstlane_b32 s94, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 27
; SI-NEXT: s_lshr_b32 s90, s29, 16
; SI-NEXT: s_lshr_b32 s93, s28, 16
; SI-NEXT: s_lshr_b32 s30, s27, 16
@@ -30454,10 +30454,12 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
; GFX9-NEXT: v_readfirstlane_b32 s60, v10
@@ -30474,7 +30476,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s15, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
; GFX9-NEXT: v_readfirstlane_b32 s14, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -30499,7 +30500,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s18, s15, 16
; GFX9-NEXT: s_lshr_b32 s19, s14, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v12
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s40, s20, s13
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
@@ -30748,11 +30748,11 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v52i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr49
@@ -30951,11 +30951,11 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v13f64_to_v52i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr41
@@ -31119,11 +31119,11 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v13f64_to_v52i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr41
@@ -31444,7 +31444,12 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_writelane_b32 v45, s34, 0
; SI-NEXT: v_writelane_b32 v45, s35, 1
; SI-NEXT: v_writelane_b32 v45, s36, 2
@@ -31452,8 +31457,9 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v45, s38, 4
; SI-NEXT: v_writelane_b32 v45, s39, 5
; SI-NEXT: v_writelane_b32 v45, s48, 6
-; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_writelane_b32 v45, s30, 7
+; SI-NEXT: v_writelane_b32 v45, s31, 8
+; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_readfirstlane_b32 s13, v11
; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_readfirstlane_b32 s15, v9
@@ -31467,12 +31473,6 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s5, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v45, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB49_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s48, s13, 16
@@ -31736,6 +31736,10 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
; VI-LABEL: bitcast_v13f64_to_v52i16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: v_readfirstlane_b32 s9, v11
; VI-NEXT: v_readfirstlane_b32 s8, v10
@@ -31750,10 +31754,6 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s5, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB49_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s42, s9, 16
@@ -31968,6 +31968,10 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
; GFX9-LABEL: bitcast_v13f64_to_v52i16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: v_readfirstlane_b32 s9, v11
; GFX9-NEXT: v_readfirstlane_b32 s8, v10
@@ -31982,10 +31986,6 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB49_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s42, s9, 16
@@ -34452,10 +34452,12 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
; GFX9-NEXT: v_readfirstlane_b32 s60, v10
@@ -34472,7 +34474,6 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s15, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
; GFX9-NEXT: v_readfirstlane_b32 s14, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -34497,7 +34498,6 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s18, s15, 16
; GFX9-NEXT: s_lshr_b32 s19, s14, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v12
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s40, s20, s13
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
@@ -34745,11 +34745,11 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v52f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr49
@@ -34948,11 +34948,11 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v13f64_to_v52f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr41
@@ -35116,11 +35116,11 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v13f64_to_v52f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr41
@@ -35441,7 +35441,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_writelane_b32 v45, s34, 0
; SI-NEXT: v_writelane_b32 v45, s35, 1
; SI-NEXT: v_writelane_b32 v45, s36, 2
@@ -35449,8 +35454,9 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: v_writelane_b32 v45, s38, 4
; SI-NEXT: v_writelane_b32 v45, s39, 5
; SI-NEXT: v_writelane_b32 v45, s48, 6
-; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_writelane_b32 v45, s30, 7
+; SI-NEXT: v_writelane_b32 v45, s31, 8
+; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_readfirstlane_b32 s13, v11
; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_readfirstlane_b32 s15, v9
@@ -35464,12 +35470,6 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: v_readfirstlane_b32 s5, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v45, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB53_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s48, s13, 16
@@ -35733,6 +35733,10 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; VI-LABEL: bitcast_v13f64_to_v52f16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: v_readfirstlane_b32 s9, v11
; VI-NEXT: v_readfirstlane_b32 s8, v10
@@ -35747,10 +35751,6 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; VI-NEXT: v_readfirstlane_b32 s5, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB53_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s42, s9, 16
@@ -35965,6 +35965,10 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; GFX9-LABEL: bitcast_v13f64_to_v52f16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: v_readfirstlane_b32 s9, v11
; GFX9-NEXT: v_readfirstlane_b32 s8, v10
@@ -35979,10 +35983,6 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB53_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s42, s9, 16
@@ -37880,6 +37880,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_writelane_b32 v32, s82, 24
; SI-NEXT: v_writelane_b32 v32, s83, 25
; SI-NEXT: v_writelane_b32 v32, s30, 26
+; SI-NEXT: v_writelane_b32 v32, s31, 27
; SI-NEXT: v_readfirstlane_b32 s6, v11
; SI-NEXT: v_readfirstlane_b32 s8, v10
; SI-NEXT: v_readfirstlane_b32 s10, v9
@@ -37892,7 +37893,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_readfirstlane_b32 s88, v2
; SI-NEXT: v_readfirstlane_b32 s91, v1
; SI-NEXT: v_readfirstlane_b32 s94, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 27
; SI-NEXT: s_lshr_b32 s90, s29, 16
; SI-NEXT: s_lshr_b32 s93, s28, 16
; SI-NEXT: s_lshr_b32 s30, s27, 16
@@ -38639,10 +38639,12 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
; GFX9-NEXT: v_readfirstlane_b32 s60, v10
@@ -38659,7 +38661,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s15, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s39, s19, s14
; GFX9-NEXT: v_readfirstlane_b32 s14, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -38684,7 +38685,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX9-NEXT: s_lshr_b32 s18, s15, 16
; GFX9-NEXT: s_lshr_b32 s19, s14, 16
; GFX9-NEXT: v_readfirstlane_b32 s40, v12
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s40, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s40, s20, s13
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
@@ -40198,6 +40198,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v26, s98, 32
; SI-NEXT: v_writelane_b32 v26, s99, 33
; SI-NEXT: v_writelane_b32 v26, s30, 34
+; SI-NEXT: v_writelane_b32 v26, s31, 35
; SI-NEXT: v_readfirstlane_b32 s85, v11
; SI-NEXT: v_readfirstlane_b32 s99, v10
; SI-NEXT: v_readfirstlane_b32 s81, v9
@@ -40210,7 +40211,6 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s97, v2
; SI-NEXT: v_readfirstlane_b32 s84, v1
; SI-NEXT: v_readfirstlane_b32 s86, v0
-; SI-NEXT: v_writelane_b32 v26, s31, 35
; SI-NEXT: s_lshr_b32 s54, s29, 16
; SI-NEXT: s_lshr_b32 s91, s28, 16
; SI-NEXT: s_lshr_b32 s53, s27, 16
@@ -40909,6 +40909,10 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX9-LABEL: bitcast_v52i16_to_v52f16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s91, v11
; GFX9-NEXT: v_readfirstlane_b32 s90, v10
; GFX9-NEXT: v_readfirstlane_b32 s89, v9
@@ -40949,10 +40953,6 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s44, s72, 16
; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB57_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_cbranch_execnz .LBB57_4
@@ -42437,6 +42437,22 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-LABEL: bitcast_v52f16_to_v52i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s43, v11
; SI-NEXT: v_readfirstlane_b32 s57, v10
; SI-NEXT: v_readfirstlane_b32 s41, v9
@@ -42477,22 +42493,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s76, s79, 16
; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB59_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_cbranch_execnz .LBB59_4
@@ -42899,6 +42899,10 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; VI-LABEL: bitcast_v52f16_to_v52i16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_readfirstlane_b32 s44, v11
; VI-NEXT: v_readfirstlane_b32 s46, v10
; VI-NEXT: v_readfirstlane_b32 s56, v9
@@ -42939,10 +42943,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; VI-NEXT: s_lshr_b32 s91, s90, 16
; VI-NEXT: v_readfirstlane_b32 s4, v12
; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB59_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB59_4
@@ -43119,6 +43119,10 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX9-LABEL: bitcast_v52f16_to_v52i16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s91, v11
; GFX9-NEXT: v_readfirstlane_b32 s90, v10
; GFX9-NEXT: v_readfirstlane_b32 s89, v9
@@ -43159,10 +43163,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s44, s72, 16
; GFX9-NEXT: v_readfirstlane_b32 s4, v12
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB59_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_cbranch_execnz .LBB59_4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 17370fc4b8480..34d99439c7a0b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -713,8 +713,8 @@ define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -848,8 +848,8 @@ define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -983,8 +983,8 @@ define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -1108,48 +1108,48 @@ define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB3_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -2950,8 +2950,8 @@ define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -3071,8 +3071,8 @@ define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -3192,8 +3192,8 @@ define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -3303,48 +3303,48 @@ define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -3423,7 +3423,6 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v56i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -3432,6 +3431,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr53
@@ -3667,7 +3667,6 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v28i32_to_v56i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -3676,6 +3675,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: ; implicit-def: $vgpr47
; VI-NEXT: ; implicit-def: $vgpr46
; VI-NEXT: ; implicit-def: $vgpr45
@@ -3868,7 +3868,6 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v28i32_to_v56i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -3877,6 +3876,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: ; implicit-def: $vgpr47
; GFX9-NEXT: ; implicit-def: $vgpr46
; GFX9-NEXT: ; implicit-def: $vgpr45
@@ -4275,8 +4275,9 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; SI-NEXT: v_writelane_b32 v28, s51, 9
; SI-NEXT: v_writelane_b32 v28, s52, 10
; SI-NEXT: v_writelane_b32 v28, s53, 11
-; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_writelane_b32 v28, s30, 12
+; SI-NEXT: v_writelane_b32 v28, s31, 13
+; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_readfirstlane_b32 s5, v13
; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_readfirstlane_b32 s7, v11
@@ -4292,7 +4293,6 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s41, v1
; SI-NEXT: s_cmp_lg_u32 s40, 0
; SI-NEXT: v_readfirstlane_b32 s40, v0
-; SI-NEXT: v_writelane_b32 v28, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB13_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s30, s5, 16
@@ -4552,8 +4552,9 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v28, s34, 0
; VI-NEXT: v_writelane_b32 v28, s35, 1
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v28, s30, 2
+; VI-NEXT: v_writelane_b32 v28, s31, 3
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_readfirstlane_b32 s6, v13
; VI-NEXT: v_readfirstlane_b32 s7, v12
; VI-NEXT: v_readfirstlane_b32 s8, v11
@@ -4569,7 +4570,6 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s42, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s43, v0
-; VI-NEXT: v_writelane_b32 v28, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB13_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s44, s6, 16
@@ -6869,6 +6869,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v28, s86, 28
; VI-NEXT: v_writelane_b32 v28, s87, 29
; VI-NEXT: v_writelane_b32 v28, s30, 30
+; VI-NEXT: v_writelane_b32 v28, s31, 31
; VI-NEXT: v_readfirstlane_b32 s86, v13
; VI-NEXT: v_readfirstlane_b32 s6, v12
; VI-NEXT: v_readfirstlane_b32 s9, v11
@@ -6883,7 +6884,6 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s69, v2
; VI-NEXT: v_readfirstlane_b32 s81, v1
; VI-NEXT: v_readfirstlane_b32 s84, v0
-; VI-NEXT: v_writelane_b32 v28, s31, 31
; VI-NEXT: s_lshr_b32 s79, s29, 16
; VI-NEXT: s_lshr_b32 s90, s28, 16
; VI-NEXT: s_lshr_b32 s31, s27, 16
@@ -7315,13 +7315,15 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
; GFX9-NEXT: s_lshr_b32 s13, s20, 16
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
; GFX9-NEXT: v_readfirstlane_b32 s62, v12
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
@@ -7342,7 +7344,6 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s13, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -7366,7 +7367,6 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s20, s13, 16
; GFX9-NEXT: s_lshr_b32 s21, s12, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v14
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s22, s11
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
@@ -7624,7 +7624,6 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v56f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -7633,6 +7632,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr53
@@ -7868,7 +7868,6 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v28i32_to_v56f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -7877,6 +7876,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: ; implicit-def: $vgpr47
; VI-NEXT: ; implicit-def: $vgpr46
; VI-NEXT: ; implicit-def: $vgpr45
@@ -8069,7 +8069,6 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v28i32_to_v56f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -8078,6 +8077,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: ; implicit-def: $vgpr47
; GFX9-NEXT: ; implicit-def: $vgpr46
; GFX9-NEXT: ; implicit-def: $vgpr45
@@ -8476,8 +8476,9 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; SI-NEXT: v_writelane_b32 v28, s51, 9
; SI-NEXT: v_writelane_b32 v28, s52, 10
; SI-NEXT: v_writelane_b32 v28, s53, 11
-; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_writelane_b32 v28, s30, 12
+; SI-NEXT: v_writelane_b32 v28, s31, 13
+; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_readfirstlane_b32 s5, v13
; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_readfirstlane_b32 s7, v11
@@ -8493,7 +8494,6 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s41, v1
; SI-NEXT: s_cmp_lg_u32 s40, 0
; SI-NEXT: v_readfirstlane_b32 s40, v0
-; SI-NEXT: v_writelane_b32 v28, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB17_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s30, s5, 16
@@ -8753,8 +8753,9 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v28, s34, 0
; VI-NEXT: v_writelane_b32 v28, s35, 1
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v28, s30, 2
+; VI-NEXT: v_writelane_b32 v28, s31, 3
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_readfirstlane_b32 s6, v13
; VI-NEXT: v_readfirstlane_b32 s7, v12
; VI-NEXT: v_readfirstlane_b32 s8, v11
@@ -8770,7 +8771,6 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s42, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s43, v0
-; VI-NEXT: v_writelane_b32 v28, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB17_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s44, s6, 16
@@ -10826,6 +10826,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s86, 28
; SI-NEXT: v_writelane_b32 v32, s87, 29
; SI-NEXT: v_writelane_b32 v32, s30, 30
+; SI-NEXT: v_writelane_b32 v32, s31, 31
; SI-NEXT: v_readfirstlane_b32 s6, v13
; SI-NEXT: v_readfirstlane_b32 s8, v12
; SI-NEXT: v_readfirstlane_b32 s10, v11
@@ -10840,7 +10841,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s92, v2
; SI-NEXT: v_readfirstlane_b32 s95, v1
; SI-NEXT: v_readfirstlane_b32 s34, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 31
; SI-NEXT: s_lshr_b32 s94, s29, 16
; SI-NEXT: s_lshr_b32 s30, s28, 16
; SI-NEXT: s_lshr_b32 s35, s27, 16
@@ -11295,12 +11295,13 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s84, 26
; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: v_writelane_b32 v32, s86, 28
-; VI-NEXT: v_readfirstlane_b32 s8, v12
; VI-NEXT: v_writelane_b32 v32, s87, 29
+; VI-NEXT: v_writelane_b32 v32, s30, 30
+; VI-NEXT: v_writelane_b32 v32, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s8, v12
; VI-NEXT: s_lshr_b32 s15, s8, 16
; VI-NEXT: v_readfirstlane_b32 s10, v11
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v32, s30, 30
; VI-NEXT: v_readfirstlane_b32 s6, v13
; VI-NEXT: s_lshr_b32 s61, s10, 16
; VI-NEXT: v_readfirstlane_b32 s12, v10
@@ -11315,7 +11316,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s80, v1
; VI-NEXT: v_readfirstlane_b32 s83, v0
; VI-NEXT: v_writelane_b32 v33, s15, 0
-; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: s_lshr_b32 s56, s29, 16
; VI-NEXT: s_lshr_b32 s75, s28, 16
; VI-NEXT: s_lshr_b32 s90, s27, 16
@@ -11656,13 +11656,15 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
; GFX9-NEXT: s_lshr_b32 s13, s20, 16
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
; GFX9-NEXT: v_readfirstlane_b32 s62, v12
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
@@ -11683,7 +11685,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s13, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -11707,7 +11708,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s20, s13, 16
; GFX9-NEXT: s_lshr_b32 s21, s12, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v14
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s22, s11
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
@@ -12149,8 +12149,8 @@ define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -12284,8 +12284,8 @@ define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -12419,8 +12419,8 @@ define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -12544,48 +12544,48 @@ define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB21_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -13388,8 +13388,8 @@ define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -13523,8 +13523,8 @@ define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -13658,8 +13658,8 @@ define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -13783,48 +13783,48 @@ define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB25_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -14058,8 +14058,8 @@ define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -14179,8 +14179,8 @@ define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -14300,8 +14300,8 @@ define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -14411,48 +14411,48 @@ define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB27_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -14531,7 +14531,6 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v56i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -14540,6 +14539,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr53
@@ -14775,7 +14775,6 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v28f32_to_v56i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -14784,6 +14783,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: ; implicit-def: $vgpr47
; VI-NEXT: ; implicit-def: $vgpr46
; VI-NEXT: ; implicit-def: $vgpr45
@@ -14976,7 +14976,6 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v28f32_to_v56i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -14985,6 +14984,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: ; implicit-def: $vgpr47
; GFX9-NEXT: ; implicit-def: $vgpr46
; GFX9-NEXT: ; implicit-def: $vgpr45
@@ -15342,7 +15342,16 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v58, s34, 0
; SI-NEXT: v_writelane_b32 v58, s35, 1
; SI-NEXT: v_writelane_b32 v58, s36, 2
@@ -15355,8 +15364,9 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v58, s51, 9
; SI-NEXT: v_writelane_b32 v58, s52, 10
; SI-NEXT: v_writelane_b32 v58, s53, 11
-; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_writelane_b32 v58, s30, 12
+; SI-NEXT: v_writelane_b32 v58, s31, 13
+; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_readfirstlane_b32 s5, v13
; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_readfirstlane_b32 s7, v11
@@ -15372,17 +15382,6 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s41, v1
; SI-NEXT: s_cmp_lg_u32 s40, 0
; SI-NEXT: v_readfirstlane_b32 s40, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v58, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB29_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s53, s5, 16
@@ -15688,10 +15687,19 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v56, s34, 0
; VI-NEXT: v_writelane_b32 v56, s35, 1
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v56, s30, 2
+; VI-NEXT: v_writelane_b32 v56, s31, 3
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_readfirstlane_b32 s6, v13
; VI-NEXT: v_readfirstlane_b32 s7, v12
; VI-NEXT: v_readfirstlane_b32 s8, v11
@@ -15707,15 +15715,6 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s42, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s43, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: v_writelane_b32 v56, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB29_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s44, s6, 16
@@ -15970,6 +15969,14 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; GFX9-LABEL: bitcast_v28f32_to_v56i16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: v_readfirstlane_b32 s6, v13
; GFX9-NEXT: v_readfirstlane_b32 s7, v12
@@ -15986,14 +15993,6 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s42, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s43, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB29_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s44, s6, 16
@@ -18340,6 +18339,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; VI-NEXT: v_writelane_b32 v28, s86, 28
; VI-NEXT: v_writelane_b32 v28, s87, 29
; VI-NEXT: v_writelane_b32 v28, s30, 30
+; VI-NEXT: v_writelane_b32 v28, s31, 31
; VI-NEXT: v_readfirstlane_b32 s86, v13
; VI-NEXT: v_readfirstlane_b32 s6, v12
; VI-NEXT: v_readfirstlane_b32 s9, v11
@@ -18354,7 +18354,6 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s69, v2
; VI-NEXT: v_readfirstlane_b32 s81, v1
; VI-NEXT: v_readfirstlane_b32 s84, v0
-; VI-NEXT: v_writelane_b32 v28, s31, 31
; VI-NEXT: s_lshr_b32 s79, s29, 16
; VI-NEXT: s_lshr_b32 s90, s28, 16
; VI-NEXT: s_lshr_b32 s31, s27, 16
@@ -18786,13 +18785,15 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
; GFX9-NEXT: s_lshr_b32 s13, s20, 16
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
; GFX9-NEXT: v_readfirstlane_b32 s62, v12
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
@@ -18813,7 +18814,6 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s13, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -18837,7 +18837,6 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s20, s13, 16
; GFX9-NEXT: s_lshr_b32 s21, s12, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v14
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s22, s11
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
@@ -19095,7 +19094,6 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v56f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -19104,6 +19102,7 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr53
@@ -19339,7 +19338,6 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v28f32_to_v56f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -19348,6 +19346,7 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: ; implicit-def: $vgpr47
; VI-NEXT: ; implicit-def: $vgpr46
; VI-NEXT: ; implicit-def: $vgpr45
@@ -19540,7 +19539,6 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v28f32_to_v56f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -19549,6 +19547,7 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: ; implicit-def: $vgpr47
; GFX9-NEXT: ; implicit-def: $vgpr46
; GFX9-NEXT: ; implicit-def: $vgpr45
@@ -19906,7 +19905,16 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v58, s34, 0
; SI-NEXT: v_writelane_b32 v58, s35, 1
; SI-NEXT: v_writelane_b32 v58, s36, 2
@@ -19919,8 +19927,9 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v58, s51, 9
; SI-NEXT: v_writelane_b32 v58, s52, 10
; SI-NEXT: v_writelane_b32 v58, s53, 11
-; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_writelane_b32 v58, s30, 12
+; SI-NEXT: v_writelane_b32 v58, s31, 13
+; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_readfirstlane_b32 s5, v13
; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_readfirstlane_b32 s7, v11
@@ -19936,17 +19945,6 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s41, v1
; SI-NEXT: s_cmp_lg_u32 s40, 0
; SI-NEXT: v_readfirstlane_b32 s40, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v58, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB33_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s53, s5, 16
@@ -20252,10 +20250,19 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v56, s34, 0
; VI-NEXT: v_writelane_b32 v56, s35, 1
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v56, s30, 2
+; VI-NEXT: v_writelane_b32 v56, s31, 3
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_readfirstlane_b32 s6, v13
; VI-NEXT: v_readfirstlane_b32 s7, v12
; VI-NEXT: v_readfirstlane_b32 s8, v11
@@ -20271,15 +20278,6 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s42, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s43, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: v_writelane_b32 v56, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB33_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s44, s6, 16
@@ -20534,6 +20532,14 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; GFX9-LABEL: bitcast_v28f32_to_v56f16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: v_readfirstlane_b32 s6, v13
; GFX9-NEXT: v_readfirstlane_b32 s7, v12
@@ -20550,14 +20556,6 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s42, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s43, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB33_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s44, s6, 16
@@ -22660,6 +22658,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s86, 28
; SI-NEXT: v_writelane_b32 v32, s87, 29
; SI-NEXT: v_writelane_b32 v32, s30, 30
+; SI-NEXT: v_writelane_b32 v32, s31, 31
; SI-NEXT: v_readfirstlane_b32 s6, v13
; SI-NEXT: v_readfirstlane_b32 s8, v12
; SI-NEXT: v_readfirstlane_b32 s10, v11
@@ -22674,7 +22673,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s92, v2
; SI-NEXT: v_readfirstlane_b32 s95, v1
; SI-NEXT: v_readfirstlane_b32 s34, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 31
; SI-NEXT: s_lshr_b32 s94, s29, 16
; SI-NEXT: s_lshr_b32 s30, s28, 16
; SI-NEXT: s_lshr_b32 s35, s27, 16
@@ -23129,12 +23127,13 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s84, 26
; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: v_writelane_b32 v32, s86, 28
-; VI-NEXT: v_readfirstlane_b32 s8, v12
; VI-NEXT: v_writelane_b32 v32, s87, 29
+; VI-NEXT: v_writelane_b32 v32, s30, 30
+; VI-NEXT: v_writelane_b32 v32, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s8, v12
; VI-NEXT: s_lshr_b32 s15, s8, 16
; VI-NEXT: v_readfirstlane_b32 s10, v11
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v32, s30, 30
; VI-NEXT: v_readfirstlane_b32 s6, v13
; VI-NEXT: s_lshr_b32 s61, s10, 16
; VI-NEXT: v_readfirstlane_b32 s12, v10
@@ -23149,7 +23148,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s80, v1
; VI-NEXT: v_readfirstlane_b32 s83, v0
; VI-NEXT: v_writelane_b32 v33, s15, 0
-; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: s_lshr_b32 s56, s29, 16
; VI-NEXT: s_lshr_b32 s75, s28, 16
; VI-NEXT: s_lshr_b32 s90, s27, 16
@@ -23490,13 +23488,15 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
; GFX9-NEXT: s_lshr_b32 s13, s20, 16
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
; GFX9-NEXT: v_readfirstlane_b32 s62, v12
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
@@ -23517,7 +23517,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s13, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -23541,7 +23540,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX9-NEXT: s_lshr_b32 s20, s13, 16
; GFX9-NEXT: s_lshr_b32 s21, s12, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v14
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s22, s11
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
@@ -24467,8 +24465,8 @@ define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s52, 8
; SI-NEXT: v_writelane_b32 v32, s53, 9
; SI-NEXT: v_writelane_b32 v32, s54, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_writelane_b32 v32, s55, 11
+; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -24588,8 +24586,8 @@ define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s52, 8
; VI-NEXT: v_writelane_b32 v32, s53, 9
; VI-NEXT: v_writelane_b32 v32, s54, 10
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v32, s55, 11
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -24709,8 +24707,8 @@ define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
-; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -24820,48 +24818,48 @@ define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
; GFX11-NEXT: s_cbranch_scc0 .LBB39_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -24940,7 +24938,6 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v56i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -24949,6 +24946,7 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr53
@@ -25184,7 +25182,6 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v14i64_to_v56i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -25193,6 +25190,7 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: ; implicit-def: $vgpr47
; VI-NEXT: ; implicit-def: $vgpr46
; VI-NEXT: ; implicit-def: $vgpr45
@@ -25385,7 +25383,6 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v14i64_to_v56i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -25394,6 +25391,7 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: ; implicit-def: $vgpr47
; GFX9-NEXT: ; implicit-def: $vgpr46
; GFX9-NEXT: ; implicit-def: $vgpr45
@@ -25806,8 +25804,9 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; SI-NEXT: v_writelane_b32 v28, s51, 9
; SI-NEXT: v_writelane_b32 v28, s52, 10
; SI-NEXT: v_writelane_b32 v28, s53, 11
-; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_writelane_b32 v28, s30, 12
+; SI-NEXT: v_writelane_b32 v28, s31, 13
+; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_readfirstlane_b32 s5, v13
; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_readfirstlane_b32 s7, v11
@@ -25823,7 +25822,6 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s41, v1
; SI-NEXT: s_cmp_lg_u32 s40, 0
; SI-NEXT: v_readfirstlane_b32 s40, v0
-; SI-NEXT: v_writelane_b32 v28, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB41_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s30, s5, 16
@@ -26083,8 +26081,9 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v28, s34, 0
; VI-NEXT: v_writelane_b32 v28, s35, 1
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v28, s30, 2
+; VI-NEXT: v_writelane_b32 v28, s31, 3
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_readfirstlane_b32 s6, v13
; VI-NEXT: v_readfirstlane_b32 s7, v12
; VI-NEXT: v_readfirstlane_b32 s8, v11
@@ -26100,7 +26099,6 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s42, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s43, v0
-; VI-NEXT: v_writelane_b32 v28, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB41_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s44, s6, 16
@@ -28400,6 +28398,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v28, s86, 28
; VI-NEXT: v_writelane_b32 v28, s87, 29
; VI-NEXT: v_writelane_b32 v28, s30, 30
+; VI-NEXT: v_writelane_b32 v28, s31, 31
; VI-NEXT: v_readfirstlane_b32 s86, v13
; VI-NEXT: v_readfirstlane_b32 s6, v12
; VI-NEXT: v_readfirstlane_b32 s9, v11
@@ -28414,7 +28413,6 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s69, v2
; VI-NEXT: v_readfirstlane_b32 s81, v1
; VI-NEXT: v_readfirstlane_b32 s84, v0
-; VI-NEXT: v_writelane_b32 v28, s31, 31
; VI-NEXT: s_lshr_b32 s79, s29, 16
; VI-NEXT: s_lshr_b32 s90, s28, 16
; VI-NEXT: s_lshr_b32 s31, s27, 16
@@ -28846,13 +28844,15 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
; GFX9-NEXT: s_lshr_b32 s13, s20, 16
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
; GFX9-NEXT: v_readfirstlane_b32 s62, v12
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
@@ -28873,7 +28873,6 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s13, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -28897,7 +28896,6 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s20, s13, 16
; GFX9-NEXT: s_lshr_b32 s21, s12, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v14
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s22, s11
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
@@ -29155,7 +29153,6 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v56f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -29164,6 +29161,7 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr53
@@ -29399,7 +29397,6 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v14i64_to_v56f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -29408,6 +29405,7 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: ; implicit-def: $vgpr47
; VI-NEXT: ; implicit-def: $vgpr46
; VI-NEXT: ; implicit-def: $vgpr45
@@ -29600,7 +29598,6 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v14i64_to_v56f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -29609,6 +29606,7 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: ; implicit-def: $vgpr47
; GFX9-NEXT: ; implicit-def: $vgpr46
; GFX9-NEXT: ; implicit-def: $vgpr45
@@ -30021,8 +30019,9 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; SI-NEXT: v_writelane_b32 v28, s51, 9
; SI-NEXT: v_writelane_b32 v28, s52, 10
; SI-NEXT: v_writelane_b32 v28, s53, 11
-; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_writelane_b32 v28, s30, 12
+; SI-NEXT: v_writelane_b32 v28, s31, 13
+; SI-NEXT: v_readfirstlane_b32 s40, v14
; SI-NEXT: v_readfirstlane_b32 s5, v13
; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: v_readfirstlane_b32 s7, v11
@@ -30038,7 +30037,6 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s41, v1
; SI-NEXT: s_cmp_lg_u32 s40, 0
; SI-NEXT: v_readfirstlane_b32 s40, v0
-; SI-NEXT: v_writelane_b32 v28, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB45_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s30, s5, 16
@@ -30298,8 +30296,9 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v28, s34, 0
; VI-NEXT: v_writelane_b32 v28, s35, 1
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v28, s30, 2
+; VI-NEXT: v_writelane_b32 v28, s31, 3
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_readfirstlane_b32 s6, v13
; VI-NEXT: v_readfirstlane_b32 s7, v12
; VI-NEXT: v_readfirstlane_b32 s8, v11
@@ -30315,7 +30314,6 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s42, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s43, v0
-; VI-NEXT: v_writelane_b32 v28, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB45_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s44, s6, 16
@@ -32371,6 +32369,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v32, s86, 28
; SI-NEXT: v_writelane_b32 v32, s87, 29
; SI-NEXT: v_writelane_b32 v32, s30, 30
+; SI-NEXT: v_writelane_b32 v32, s31, 31
; SI-NEXT: v_readfirstlane_b32 s6, v13
; SI-NEXT: v_readfirstlane_b32 s8, v12
; SI-NEXT: v_readfirstlane_b32 s10, v11
@@ -32385,7 +32384,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s92, v2
; SI-NEXT: v_readfirstlane_b32 s95, v1
; SI-NEXT: v_readfirstlane_b32 s34, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 31
; SI-NEXT: s_lshr_b32 s94, s29, 16
; SI-NEXT: s_lshr_b32 s30, s28, 16
; SI-NEXT: s_lshr_b32 s35, s27, 16
@@ -32840,12 +32838,13 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s84, 26
; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: v_writelane_b32 v32, s86, 28
-; VI-NEXT: v_readfirstlane_b32 s8, v12
; VI-NEXT: v_writelane_b32 v32, s87, 29
+; VI-NEXT: v_writelane_b32 v32, s30, 30
+; VI-NEXT: v_writelane_b32 v32, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s8, v12
; VI-NEXT: s_lshr_b32 s15, s8, 16
; VI-NEXT: v_readfirstlane_b32 s10, v11
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v32, s30, 30
; VI-NEXT: v_readfirstlane_b32 s6, v13
; VI-NEXT: s_lshr_b32 s61, s10, 16
; VI-NEXT: v_readfirstlane_b32 s12, v10
@@ -32860,7 +32859,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s80, v1
; VI-NEXT: v_readfirstlane_b32 s83, v0
; VI-NEXT: v_writelane_b32 v33, s15, 0
-; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: s_lshr_b32 s56, s29, 16
; VI-NEXT: s_lshr_b32 s75, s28, 16
; VI-NEXT: s_lshr_b32 s90, s27, 16
@@ -33201,13 +33199,15 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
; GFX9-NEXT: s_lshr_b32 s13, s20, 16
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
; GFX9-NEXT: v_readfirstlane_b32 s62, v12
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
@@ -33228,7 +33228,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s13, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -33252,7 +33251,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s20, s13, 16
; GFX9-NEXT: s_lshr_b32 s21, s12, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v14
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s22, s11
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
@@ -33511,7 +33509,6 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v56i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -33520,6 +33517,7 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr53
@@ -33741,7 +33739,6 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v14f64_to_v56i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -33750,6 +33747,7 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: ; implicit-def: $vgpr47
; VI-NEXT: ; implicit-def: $vgpr46
; VI-NEXT: ; implicit-def: $vgpr45
@@ -33928,7 +33926,6 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v14f64_to_v56i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -33937,6 +33934,7 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: ; implicit-def: $vgpr47
; GFX9-NEXT: ; implicit-def: $vgpr46
; GFX9-NEXT: ; implicit-def: $vgpr45
@@ -34280,7 +34278,16 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v58, s34, 0
; SI-NEXT: v_writelane_b32 v58, s35, 1
; SI-NEXT: v_writelane_b32 v58, s36, 2
@@ -34293,8 +34300,9 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v58, s51, 9
; SI-NEXT: v_writelane_b32 v58, s52, 10
; SI-NEXT: v_writelane_b32 v58, s53, 11
-; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_writelane_b32 v58, s30, 12
+; SI-NEXT: v_writelane_b32 v58, s31, 13
+; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_readfirstlane_b32 s41, v13
; SI-NEXT: v_readfirstlane_b32 s40, v12
; SI-NEXT: v_readfirstlane_b32 s15, v11
@@ -34310,17 +34318,6 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s5, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v58, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB49_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s53, s41, 16
@@ -34618,10 +34615,19 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v56, s34, 0
; VI-NEXT: v_writelane_b32 v56, s35, 1
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v56, s30, 2
+; VI-NEXT: v_writelane_b32 v56, s31, 3
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_readfirstlane_b32 s9, v13
; VI-NEXT: v_readfirstlane_b32 s8, v12
; VI-NEXT: v_readfirstlane_b32 s11, v11
@@ -34637,15 +34643,6 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s5, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: v_writelane_b32 v56, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB49_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s44, s9, 16
@@ -34886,6 +34883,14 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; GFX9-LABEL: bitcast_v14f64_to_v56i16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: v_readfirstlane_b32 s9, v13
; GFX9-NEXT: v_readfirstlane_b32 s8, v12
@@ -34902,14 +34907,6 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB49_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s44, s9, 16
@@ -37215,6 +37212,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; VI-NEXT: v_writelane_b32 v28, s86, 28
; VI-NEXT: v_writelane_b32 v28, s87, 29
; VI-NEXT: v_writelane_b32 v28, s30, 30
+; VI-NEXT: v_writelane_b32 v28, s31, 31
; VI-NEXT: v_readfirstlane_b32 s86, v13
; VI-NEXT: v_readfirstlane_b32 s6, v12
; VI-NEXT: v_readfirstlane_b32 s9, v11
@@ -37229,7 +37227,6 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s69, v2
; VI-NEXT: v_readfirstlane_b32 s81, v1
; VI-NEXT: v_readfirstlane_b32 s84, v0
-; VI-NEXT: v_writelane_b32 v28, s31, 31
; VI-NEXT: s_lshr_b32 s79, s29, 16
; VI-NEXT: s_lshr_b32 s90, s28, 16
; VI-NEXT: s_lshr_b32 s31, s27, 16
@@ -37661,13 +37658,15 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
; GFX9-NEXT: s_lshr_b32 s13, s20, 16
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
; GFX9-NEXT: v_readfirstlane_b32 s62, v12
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
@@ -37688,7 +37687,6 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s13, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -37712,7 +37710,6 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s20, s13, 16
; GFX9-NEXT: s_lshr_b32 s21, s12, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v14
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s22, s11
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
@@ -37970,7 +37967,6 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v56f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -37979,6 +37975,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr53
@@ -38200,7 +38197,6 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v14f64_to_v56f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -38209,6 +38205,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; VI-NEXT: ; implicit-def: $vgpr47
; VI-NEXT: ; implicit-def: $vgpr46
; VI-NEXT: ; implicit-def: $vgpr45
@@ -38387,7 +38384,6 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v14f64_to_v56f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -38396,6 +38392,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: ; implicit-def: $vgpr47
; GFX9-NEXT: ; implicit-def: $vgpr46
; GFX9-NEXT: ; implicit-def: $vgpr45
@@ -38739,7 +38736,16 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v58, s34, 0
; SI-NEXT: v_writelane_b32 v58, s35, 1
; SI-NEXT: v_writelane_b32 v58, s36, 2
@@ -38752,8 +38758,9 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: v_writelane_b32 v58, s51, 9
; SI-NEXT: v_writelane_b32 v58, s52, 10
; SI-NEXT: v_writelane_b32 v58, s53, 11
-; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_writelane_b32 v58, s30, 12
+; SI-NEXT: v_writelane_b32 v58, s31, 13
+; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_readfirstlane_b32 s41, v13
; SI-NEXT: v_readfirstlane_b32 s40, v12
; SI-NEXT: v_readfirstlane_b32 s15, v11
@@ -38769,17 +38776,6 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: v_readfirstlane_b32 s5, v1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v58, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB53_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s53, s41, 16
@@ -39077,10 +39073,19 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v56, s34, 0
; VI-NEXT: v_writelane_b32 v56, s35, 1
-; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_writelane_b32 v56, s30, 2
+; VI-NEXT: v_writelane_b32 v56, s31, 3
+; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: v_readfirstlane_b32 s9, v13
; VI-NEXT: v_readfirstlane_b32 s8, v12
; VI-NEXT: v_readfirstlane_b32 s11, v11
@@ -39096,15 +39101,6 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; VI-NEXT: v_readfirstlane_b32 s5, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: v_writelane_b32 v56, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB53_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s44, s9, 16
@@ -39345,6 +39341,14 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; GFX9-LABEL: bitcast_v14f64_to_v56f16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: v_readfirstlane_b32 s9, v13
; GFX9-NEXT: v_readfirstlane_b32 s8, v12
@@ -39361,14 +39365,6 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB53_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s44, s9, 16
@@ -41430,6 +41426,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_writelane_b32 v32, s86, 28
; SI-NEXT: v_writelane_b32 v32, s87, 29
; SI-NEXT: v_writelane_b32 v32, s30, 30
+; SI-NEXT: v_writelane_b32 v32, s31, 31
; SI-NEXT: v_readfirstlane_b32 s6, v13
; SI-NEXT: v_readfirstlane_b32 s8, v12
; SI-NEXT: v_readfirstlane_b32 s10, v11
@@ -41444,7 +41441,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_readfirstlane_b32 s92, v2
; SI-NEXT: v_readfirstlane_b32 s95, v1
; SI-NEXT: v_readfirstlane_b32 s34, v0
-; SI-NEXT: v_writelane_b32 v32, s31, 31
; SI-NEXT: s_lshr_b32 s94, s29, 16
; SI-NEXT: s_lshr_b32 s30, s28, 16
; SI-NEXT: s_lshr_b32 s35, s27, 16
@@ -41899,12 +41895,13 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; VI-NEXT: v_writelane_b32 v32, s84, 26
; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: v_writelane_b32 v32, s86, 28
-; VI-NEXT: v_readfirstlane_b32 s8, v12
; VI-NEXT: v_writelane_b32 v32, s87, 29
+; VI-NEXT: v_writelane_b32 v32, s30, 30
+; VI-NEXT: v_writelane_b32 v32, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s8, v12
; VI-NEXT: s_lshr_b32 s15, s8, 16
; VI-NEXT: v_readfirstlane_b32 s10, v11
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v32, s30, 30
; VI-NEXT: v_readfirstlane_b32 s6, v13
; VI-NEXT: s_lshr_b32 s61, s10, 16
; VI-NEXT: v_readfirstlane_b32 s12, v10
@@ -41919,7 +41916,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; VI-NEXT: v_readfirstlane_b32 s80, v1
; VI-NEXT: v_readfirstlane_b32 s83, v0
; VI-NEXT: v_writelane_b32 v33, s15, 0
-; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: s_lshr_b32 s56, s29, 16
; VI-NEXT: s_lshr_b32 s75, s28, 16
; VI-NEXT: s_lshr_b32 s90, s27, 16
@@ -42260,13 +42256,15 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX9-NEXT: v_writelane_b32 v32, s50, 6
; GFX9-NEXT: v_writelane_b32 v32, s51, 7
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
+; GFX9-NEXT: v_writelane_b32 v32, s53, 9
+; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
; GFX9-NEXT: s_lshr_b32 s13, s20, 16
; GFX9-NEXT: s_lshr_b32 s14, s19, 16
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
; GFX9-NEXT: v_readfirstlane_b32 s62, v12
; GFX9-NEXT: v_readfirstlane_b32 s61, v11
@@ -42287,7 +42285,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s13, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s21, s12
; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -42311,7 +42308,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX9-NEXT: s_lshr_b32 s20, s13, 16
; GFX9-NEXT: s_lshr_b32 s21, s12, 16
; GFX9-NEXT: v_readfirstlane_b32 s42, v14
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: s_cmp_lg_u32 s42, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s22, s11
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
@@ -42570,6 +42566,22 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v56i16_to_v56f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
@@ -42594,23 +42606,6 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -43961,6 +43956,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v28, s98, 32
; SI-NEXT: v_writelane_b32 v28, s99, 33
; SI-NEXT: v_writelane_b32 v28, s30, 34
+; SI-NEXT: v_writelane_b32 v28, s31, 35
; SI-NEXT: v_readfirstlane_b32 s54, v13
; SI-NEXT: v_readfirstlane_b32 s55, v12
; SI-NEXT: v_readfirstlane_b32 s52, v11
@@ -43975,7 +43971,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s84, v2
; SI-NEXT: v_readfirstlane_b32 s97, v1
; SI-NEXT: v_readfirstlane_b32 s99, v0
-; SI-NEXT: v_writelane_b32 v28, s31, 35
; SI-NEXT: s_lshr_b32 s64, s29, 16
; SI-NEXT: s_lshr_b32 s37, s28, 16
; SI-NEXT: s_lshr_b32 s65, s27, 16
@@ -44562,6 +44557,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; VI-NEXT: v_writelane_b32 v28, s34, 0
; VI-NEXT: v_writelane_b32 v28, s35, 1
; VI-NEXT: v_writelane_b32 v28, s30, 2
+; VI-NEXT: v_writelane_b32 v28, s31, 3
; VI-NEXT: v_readfirstlane_b32 s7, v13
; VI-NEXT: v_readfirstlane_b32 s8, v12
; VI-NEXT: v_readfirstlane_b32 s10, v11
@@ -44576,7 +44572,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s76, v2
; VI-NEXT: v_readfirstlane_b32 s79, v1
; VI-NEXT: v_readfirstlane_b32 s90, v0
-; VI-NEXT: v_writelane_b32 v28, s31, 3
; VI-NEXT: s_lshr_b32 s42, s29, 16
; VI-NEXT: s_lshr_b32 s45, s28, 16
; VI-NEXT: s_lshr_b32 s46, s27, 16
@@ -44795,6 +44790,14 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX9-LABEL: bitcast_v56i16_to_v56f16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s95, v13
; GFX9-NEXT: v_readfirstlane_b32 s94, v12
; GFX9-NEXT: v_readfirstlane_b32 s93, v11
@@ -44839,14 +44842,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s44, s74, 16
; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB57_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_cbranch_execnz .LBB57_4
@@ -46463,6 +46458,22 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; SI-LABEL: bitcast_v56f16_to_v56i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s44, v13
; SI-NEXT: v_readfirstlane_b32 s60, v12
; SI-NEXT: v_readfirstlane_b32 s42, v11
@@ -46507,22 +46518,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s88, s92, 16
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB59_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_cbranch_execnz .LBB59_4
@@ -46963,6 +46958,14 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v56, s34, 0
; VI-NEXT: v_writelane_b32 v56, s35, 1
; VI-NEXT: v_writelane_b32 v56, s30, 2
@@ -47011,14 +47014,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; VI-NEXT: s_lshr_b32 s35, s34, 16
; VI-NEXT: v_readfirstlane_b32 s4, v14
; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB59_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB59_4
@@ -47218,6 +47213,14 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX9-LABEL: bitcast_v56f16_to_v56i16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_readfirstlane_b32 s95, v13
; GFX9-NEXT: v_readfirstlane_b32 s94, v12
; GFX9-NEXT: v_readfirstlane_b32 s93, v11
@@ -47262,14 +47265,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s44, s74, 16
; GFX9-NEXT: v_readfirstlane_b32 s4, v14
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB59_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_cbranch_execnz .LBB59_4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 8c5d66f1227ea..c304c435d2203 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -753,8 +753,8 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
-; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -896,8 +896,8 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s54, 10
; VI-NEXT: v_writelane_b32 v32, s55, 11
; VI-NEXT: v_writelane_b32 v32, s64, 12
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v32, s65, 13
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -1039,8 +1039,8 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -1170,52 +1170,52 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
; GFX11-NEXT: s_cbranch_scc0 .LBB3_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -3120,8 +3120,8 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
-; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -3248,8 +3248,8 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s54, 10
; VI-NEXT: v_writelane_b32 v32, s55, 11
; VI-NEXT: v_writelane_b32 v32, s64, 12
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v32, s65, 13
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -3376,8 +3376,8 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -3492,52 +3492,52 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -3619,7 +3619,6 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v60i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -3632,6 +3631,7 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr41
@@ -3885,7 +3885,6 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v30i32_to_v60i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -3898,6 +3897,7 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr57
@@ -4106,7 +4106,6 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v30i32_to_v60i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -4119,6 +4118,7 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: ; implicit-def: $vgpr59
; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr57
@@ -4546,8 +4546,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: v_writelane_b32 v30, s54, 12
; SI-NEXT: v_writelane_b32 v30, s55, 13
; SI-NEXT: v_writelane_b32 v30, s64, 14
-; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_writelane_b32 v30, s30, 15
+; SI-NEXT: v_writelane_b32 v30, s31, 16
+; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_readfirstlane_b32 s5, v15
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_readfirstlane_b32 s7, v13
@@ -4565,7 +4566,6 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s43, v1
; SI-NEXT: s_cmp_lg_u32 s42, 0
; SI-NEXT: v_readfirstlane_b32 s42, v0
-; SI-NEXT: v_writelane_b32 v30, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB13_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s34, s5, 16
@@ -4848,8 +4848,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: v_writelane_b32 v30, s37, 3
; VI-NEXT: v_writelane_b32 v30, s38, 4
; VI-NEXT: v_writelane_b32 v30, s39, 5
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v30, s30, 6
+; VI-NEXT: v_writelane_b32 v30, s31, 7
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: v_readfirstlane_b32 s7, v14
; VI-NEXT: v_readfirstlane_b32 s8, v13
@@ -4867,7 +4868,6 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s44, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s45, v0
-; VI-NEXT: v_writelane_b32 v30, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB13_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s46, s6, 16
@@ -5137,8 +5137,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v30, s34, 0
; GFX9-NEXT: v_writelane_b32 v30, s35, 1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v30, s30, 2
+; GFX9-NEXT: v_writelane_b32 v30, s31, 3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_readfirstlane_b32 s6, v15
; GFX9-NEXT: v_readfirstlane_b32 s7, v14
; GFX9-NEXT: v_readfirstlane_b32 s8, v13
@@ -5156,7 +5157,6 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s44, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s45, v0
-; GFX9-NEXT: v_writelane_b32 v30, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB13_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s6, 16
@@ -7356,20 +7356,20 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v30, s83, 25
; VI-NEXT: v_writelane_b32 v30, s84, 26
; VI-NEXT: v_writelane_b32 v30, s85, 27
-; VI-NEXT: v_readfirstlane_b32 s11, v13
; VI-NEXT: v_writelane_b32 v30, s86, 28
+; VI-NEXT: v_writelane_b32 v30, s87, 29
+; VI-NEXT: v_writelane_b32 v30, s30, 30
+; VI-NEXT: v_writelane_b32 v30, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s11, v13
; VI-NEXT: s_lshr_b32 s63, s11, 16
; VI-NEXT: v_readfirstlane_b32 s13, v12
; VI-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v30, s87, 29
; VI-NEXT: s_lshr_b32 s62, s13, 16
; VI-NEXT: v_readfirstlane_b32 s15, v11
; VI-NEXT: v_writelane_b32 v31, s63, 0
-; VI-NEXT: v_writelane_b32 v30, s30, 30
; VI-NEXT: s_lshr_b32 s61, s15, 16
; VI-NEXT: v_readfirstlane_b32 s73, v10
; VI-NEXT: v_writelane_b32 v31, s62, 1
-; VI-NEXT: v_writelane_b32 v30, s31, 31
; VI-NEXT: v_readfirstlane_b32 s7, v15
; VI-NEXT: v_readfirstlane_b32 s9, v14
; VI-NEXT: s_lshr_b32 s60, s73, 16
@@ -7759,6 +7759,9 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_writelane_b32 v32, s64, 12
+; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_lshr_b32 s10, s23, 16
; GFX9-NEXT: s_lshr_b32 s11, s22, 16
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
@@ -7767,7 +7770,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s72, v15
; GFX9-NEXT: v_readfirstlane_b32 s74, v14
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
@@ -7792,7 +7794,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s11, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
; GFX9-NEXT: v_readfirstlane_b32 s10, v0
-; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -7816,7 +7817,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s22, s11, 16
; GFX9-NEXT: s_lshr_b32 s23, s10, 16
; GFX9-NEXT: v_readfirstlane_b32 s44, v16
-; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s44, s24, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s45, s25, s8
@@ -8086,7 +8086,6 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v60f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -8099,6 +8098,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr41
@@ -8352,7 +8352,6 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v30i32_to_v60f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -8365,6 +8364,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr57
@@ -8573,7 +8573,6 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v30i32_to_v60f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -8586,6 +8585,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: ; implicit-def: $vgpr59
; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr57
@@ -9013,8 +9013,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: v_writelane_b32 v30, s54, 12
; SI-NEXT: v_writelane_b32 v30, s55, 13
; SI-NEXT: v_writelane_b32 v30, s64, 14
-; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_writelane_b32 v30, s30, 15
+; SI-NEXT: v_writelane_b32 v30, s31, 16
+; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_readfirstlane_b32 s5, v15
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_readfirstlane_b32 s7, v13
@@ -9032,7 +9033,6 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s43, v1
; SI-NEXT: s_cmp_lg_u32 s42, 0
; SI-NEXT: v_readfirstlane_b32 s42, v0
-; SI-NEXT: v_writelane_b32 v30, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB17_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s34, s5, 16
@@ -9315,8 +9315,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: v_writelane_b32 v30, s37, 3
; VI-NEXT: v_writelane_b32 v30, s38, 4
; VI-NEXT: v_writelane_b32 v30, s39, 5
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v30, s30, 6
+; VI-NEXT: v_writelane_b32 v30, s31, 7
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: v_readfirstlane_b32 s7, v14
; VI-NEXT: v_readfirstlane_b32 s8, v13
@@ -9334,7 +9335,6 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s44, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s45, v0
-; VI-NEXT: v_writelane_b32 v30, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB17_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s46, s6, 16
@@ -9604,8 +9604,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v30, s34, 0
; GFX9-NEXT: v_writelane_b32 v30, s35, 1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v30, s30, 2
+; GFX9-NEXT: v_writelane_b32 v30, s31, 3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_readfirstlane_b32 s6, v15
; GFX9-NEXT: v_readfirstlane_b32 s7, v14
; GFX9-NEXT: v_readfirstlane_b32 s8, v13
@@ -9623,7 +9624,6 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s44, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s45, v0
-; GFX9-NEXT: v_writelane_b32 v30, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB17_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s6, 16
@@ -12059,28 +12059,28 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s81, 23
; VI-NEXT: v_writelane_b32 v32, s82, 24
; VI-NEXT: v_writelane_b32 v32, s83, 25
-; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: v_writelane_b32 v32, s84, 26
+; VI-NEXT: v_writelane_b32 v32, s85, 27
+; VI-NEXT: v_writelane_b32 v32, s86, 28
+; VI-NEXT: v_writelane_b32 v32, s87, 29
+; VI-NEXT: v_writelane_b32 v32, s30, 30
+; VI-NEXT: v_writelane_b32 v32, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: s_lshr_b32 vcc_lo, s6, 16
; VI-NEXT: v_readfirstlane_b32 s8, v14
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: s_lshr_b32 vcc_hi, s8, 16
; VI-NEXT: v_readfirstlane_b32 s10, v13
; VI-NEXT: v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT: v_writelane_b32 v32, s86, 28
; VI-NEXT: s_lshr_b32 s63, s10, 16
; VI-NEXT: v_readfirstlane_b32 s12, v12
; VI-NEXT: v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT: v_writelane_b32 v32, s87, 29
; VI-NEXT: s_lshr_b32 s62, s12, 16
; VI-NEXT: v_readfirstlane_b32 s14, v11
; VI-NEXT: v_writelane_b32 v33, s63, 2
-; VI-NEXT: v_writelane_b32 v32, s30, 30
; VI-NEXT: s_lshr_b32 s61, s14, 16
; VI-NEXT: v_readfirstlane_b32 s72, v10
; VI-NEXT: v_writelane_b32 v33, s62, 3
-; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: s_lshr_b32 s60, s72, 16
; VI-NEXT: v_readfirstlane_b32 s74, v9
; VI-NEXT: v_readfirstlane_b32 s76, v8
@@ -12447,6 +12447,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_writelane_b32 v32, s64, 12
+; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_lshr_b32 s10, s23, 16
; GFX9-NEXT: s_lshr_b32 s11, s22, 16
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
@@ -12455,7 +12458,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s72, v15
; GFX9-NEXT: v_readfirstlane_b32 s74, v14
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
@@ -12480,7 +12482,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s11, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
; GFX9-NEXT: v_readfirstlane_b32 s10, v0
-; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -12504,7 +12505,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s22, s11, 16
; GFX9-NEXT: s_lshr_b32 s23, s10, 16
; GFX9-NEXT: v_readfirstlane_b32 s44, v16
-; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s44, s24, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s45, s25, s8
@@ -12967,8 +12967,8 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
-; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -13110,8 +13110,8 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s54, 10
; VI-NEXT: v_writelane_b32 v32, s55, 11
; VI-NEXT: v_writelane_b32 v32, s64, 12
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v32, s65, 13
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -13253,8 +13253,8 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -13384,52 +13384,52 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
; GFX11-NEXT: s_cbranch_scc0 .LBB21_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -14277,8 +14277,8 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
-; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -14420,8 +14420,8 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; VI-NEXT: v_writelane_b32 v32, s54, 10
; VI-NEXT: v_writelane_b32 v32, s55, 11
; VI-NEXT: v_writelane_b32 v32, s64, 12
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v32, s65, 13
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -14563,8 +14563,8 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -14694,52 +14694,52 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
; GFX11-NEXT: s_cbranch_scc0 .LBB25_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -14983,8 +14983,8 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
-; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -15111,8 +15111,8 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; VI-NEXT: v_writelane_b32 v32, s54, 10
; VI-NEXT: v_writelane_b32 v32, s55, 11
; VI-NEXT: v_writelane_b32 v32, s64, 12
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v32, s65, 13
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -15239,8 +15239,8 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -15355,52 +15355,52 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
; GFX11-NEXT: s_cbranch_scc0 .LBB27_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -15482,7 +15482,6 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v60i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -15495,6 +15494,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr41
@@ -15748,7 +15748,6 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v30f32_to_v60i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -15761,6 +15760,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr57
@@ -15969,7 +15969,6 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v30f32_to_v60i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -15982,6 +15981,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: ; implicit-def: $vgpr59
; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr57
@@ -16363,7 +16363,20 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v62, s34, 0
; SI-NEXT: v_writelane_b32 v62, s35, 1
; SI-NEXT: v_writelane_b32 v62, s36, 2
@@ -16379,8 +16392,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v62, s54, 12
; SI-NEXT: v_writelane_b32 v62, s55, 13
; SI-NEXT: v_writelane_b32 v62, s64, 14
-; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_writelane_b32 v62, s30, 15
+; SI-NEXT: v_writelane_b32 v62, s31, 16
+; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_readfirstlane_b32 s5, v15
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_readfirstlane_b32 s7, v13
@@ -16398,21 +16412,6 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s43, v1
; SI-NEXT: s_cmp_lg_u32 s42, 0
; SI-NEXT: v_readfirstlane_b32 s42, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v62, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB29_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s64, s5, 16
@@ -16743,14 +16742,27 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v60, s34, 0
; VI-NEXT: v_writelane_b32 v60, s35, 1
; VI-NEXT: v_writelane_b32 v60, s36, 2
; VI-NEXT: v_writelane_b32 v60, s37, 3
; VI-NEXT: v_writelane_b32 v60, s38, 4
; VI-NEXT: v_writelane_b32 v60, s39, 5
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v60, s30, 6
+; VI-NEXT: v_writelane_b32 v60, s31, 7
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: v_readfirstlane_b32 s7, v14
; VI-NEXT: v_readfirstlane_b32 s8, v13
@@ -16768,19 +16780,6 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s44, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s45, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: v_writelane_b32 v60, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB29_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s46, s6, 16
@@ -17062,10 +17061,23 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v60, s34, 0
; GFX9-NEXT: v_writelane_b32 v60, s35, 1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v60, s30, 2
+; GFX9-NEXT: v_writelane_b32 v60, s31, 3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_readfirstlane_b32 s6, v15
; GFX9-NEXT: v_readfirstlane_b32 s7, v14
; GFX9-NEXT: v_readfirstlane_b32 s8, v13
@@ -17083,19 +17095,6 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s44, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s45, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: v_writelane_b32 v60, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB29_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s6, 16
@@ -19626,20 +19625,20 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; VI-NEXT: v_writelane_b32 v30, s83, 25
; VI-NEXT: v_writelane_b32 v30, s84, 26
; VI-NEXT: v_writelane_b32 v30, s85, 27
-; VI-NEXT: v_readfirstlane_b32 s11, v13
; VI-NEXT: v_writelane_b32 v30, s86, 28
+; VI-NEXT: v_writelane_b32 v30, s87, 29
+; VI-NEXT: v_writelane_b32 v30, s30, 30
+; VI-NEXT: v_writelane_b32 v30, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s11, v13
; VI-NEXT: s_lshr_b32 s63, s11, 16
; VI-NEXT: v_readfirstlane_b32 s13, v12
; VI-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v30, s87, 29
; VI-NEXT: s_lshr_b32 s62, s13, 16
; VI-NEXT: v_readfirstlane_b32 s15, v11
; VI-NEXT: v_writelane_b32 v31, s63, 0
-; VI-NEXT: v_writelane_b32 v30, s30, 30
; VI-NEXT: s_lshr_b32 s61, s15, 16
; VI-NEXT: v_readfirstlane_b32 s73, v10
; VI-NEXT: v_writelane_b32 v31, s62, 1
-; VI-NEXT: v_writelane_b32 v30, s31, 31
; VI-NEXT: v_readfirstlane_b32 s7, v15
; VI-NEXT: v_readfirstlane_b32 s9, v14
; VI-NEXT: s_lshr_b32 s60, s73, 16
@@ -20029,6 +20028,9 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_writelane_b32 v32, s64, 12
+; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_lshr_b32 s10, s23, 16
; GFX9-NEXT: s_lshr_b32 s11, s22, 16
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
@@ -20037,7 +20039,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s72, v15
; GFX9-NEXT: v_readfirstlane_b32 s74, v14
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
@@ -20062,7 +20063,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s11, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
; GFX9-NEXT: v_readfirstlane_b32 s10, v0
-; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -20086,7 +20086,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s22, s11, 16
; GFX9-NEXT: s_lshr_b32 s23, s10, 16
; GFX9-NEXT: v_readfirstlane_b32 s44, v16
-; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s44, s24, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s45, s25, s8
@@ -20356,7 +20355,6 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v60f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -20369,6 +20367,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr41
@@ -20622,7 +20621,6 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v30f32_to_v60f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -20635,6 +20633,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr57
@@ -20843,7 +20842,6 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v30f32_to_v60f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -20856,6 +20854,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: ; implicit-def: $vgpr59
; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr57
@@ -21237,7 +21236,20 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v62, s34, 0
; SI-NEXT: v_writelane_b32 v62, s35, 1
; SI-NEXT: v_writelane_b32 v62, s36, 2
@@ -21253,8 +21265,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: v_writelane_b32 v62, s54, 12
; SI-NEXT: v_writelane_b32 v62, s55, 13
; SI-NEXT: v_writelane_b32 v62, s64, 14
-; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_writelane_b32 v62, s30, 15
+; SI-NEXT: v_writelane_b32 v62, s31, 16
+; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_readfirstlane_b32 s5, v15
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_readfirstlane_b32 s7, v13
@@ -21272,21 +21285,6 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s43, v1
; SI-NEXT: s_cmp_lg_u32 s42, 0
; SI-NEXT: v_readfirstlane_b32 s42, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v62, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB33_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s64, s5, 16
@@ -21617,14 +21615,27 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v60, s34, 0
; VI-NEXT: v_writelane_b32 v60, s35, 1
; VI-NEXT: v_writelane_b32 v60, s36, 2
; VI-NEXT: v_writelane_b32 v60, s37, 3
; VI-NEXT: v_writelane_b32 v60, s38, 4
; VI-NEXT: v_writelane_b32 v60, s39, 5
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v60, s30, 6
+; VI-NEXT: v_writelane_b32 v60, s31, 7
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: v_readfirstlane_b32 s7, v14
; VI-NEXT: v_readfirstlane_b32 s8, v13
@@ -21642,19 +21653,6 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s44, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s45, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: v_writelane_b32 v60, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB33_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s46, s6, 16
@@ -21936,10 +21934,23 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v60, s34, 0
; GFX9-NEXT: v_writelane_b32 v60, s35, 1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v60, s30, 2
+; GFX9-NEXT: v_writelane_b32 v60, s31, 3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_readfirstlane_b32 s6, v15
; GFX9-NEXT: v_readfirstlane_b32 s7, v14
; GFX9-NEXT: v_readfirstlane_b32 s8, v13
@@ -21957,19 +21968,6 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s44, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s45, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: v_writelane_b32 v60, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB33_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s6, 16
@@ -24736,28 +24734,28 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s81, 23
; VI-NEXT: v_writelane_b32 v32, s82, 24
; VI-NEXT: v_writelane_b32 v32, s83, 25
-; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: v_writelane_b32 v32, s84, 26
+; VI-NEXT: v_writelane_b32 v32, s85, 27
+; VI-NEXT: v_writelane_b32 v32, s86, 28
+; VI-NEXT: v_writelane_b32 v32, s87, 29
+; VI-NEXT: v_writelane_b32 v32, s30, 30
+; VI-NEXT: v_writelane_b32 v32, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: s_lshr_b32 vcc_lo, s6, 16
; VI-NEXT: v_readfirstlane_b32 s8, v14
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: s_lshr_b32 vcc_hi, s8, 16
; VI-NEXT: v_readfirstlane_b32 s10, v13
; VI-NEXT: v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT: v_writelane_b32 v32, s86, 28
; VI-NEXT: s_lshr_b32 s63, s10, 16
; VI-NEXT: v_readfirstlane_b32 s12, v12
; VI-NEXT: v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT: v_writelane_b32 v32, s87, 29
; VI-NEXT: s_lshr_b32 s62, s12, 16
; VI-NEXT: v_readfirstlane_b32 s14, v11
; VI-NEXT: v_writelane_b32 v33, s63, 2
-; VI-NEXT: v_writelane_b32 v32, s30, 30
; VI-NEXT: s_lshr_b32 s61, s14, 16
; VI-NEXT: v_readfirstlane_b32 s72, v10
; VI-NEXT: v_writelane_b32 v33, s62, 3
-; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: s_lshr_b32 s60, s72, 16
; VI-NEXT: v_readfirstlane_b32 s74, v9
; VI-NEXT: v_readfirstlane_b32 s76, v8
@@ -25124,6 +25122,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_writelane_b32 v32, s64, 12
+; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_lshr_b32 s10, s23, 16
; GFX9-NEXT: s_lshr_b32 s11, s22, 16
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
@@ -25132,7 +25133,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s72, v15
; GFX9-NEXT: v_readfirstlane_b32 s74, v14
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
@@ -25157,7 +25157,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s11, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
; GFX9-NEXT: v_readfirstlane_b32 s10, v0
-; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -25181,7 +25180,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX9-NEXT: s_lshr_b32 s22, s11, 16
; GFX9-NEXT: s_lshr_b32 s23, s10, 16
; GFX9-NEXT: v_readfirstlane_b32 s44, v16
-; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s44, s24, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s45, s25, s8
@@ -26157,8 +26155,8 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v32, s54, 10
; SI-NEXT: v_writelane_b32 v32, s55, 11
; SI-NEXT: v_writelane_b32 v32, s64, 12
-; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: v_writelane_b32 v32, s65, 13
+; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: s_mov_b32 s49, s29
; SI-NEXT: s_mov_b32 s48, s28
; SI-NEXT: s_mov_b32 s47, s27
@@ -26285,8 +26283,8 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; VI-NEXT: v_writelane_b32 v32, s54, 10
; VI-NEXT: v_writelane_b32 v32, s55, 11
; VI-NEXT: v_writelane_b32 v32, s64, 12
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v32, s65, 13
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: s_mov_b32 s49, s29
; VI-NEXT: s_mov_b32 s48, s28
; VI-NEXT: s_mov_b32 s47, s27
@@ -26413,8 +26411,8 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_writelane_b32 v32, s64, 12
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v32, s65, 13
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: s_mov_b32 s49, s29
; GFX9-NEXT: s_mov_b32 s48, s28
; GFX9-NEXT: s_mov_b32 s47, s27
@@ -26529,52 +26527,52 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
; GFX11-NEXT: v_writelane_b32 v32, s36, 0
+; GFX11-NEXT: v_writelane_b32 v32, s37, 1
+; GFX11-NEXT: v_writelane_b32 v32, s38, 2
+; GFX11-NEXT: v_writelane_b32 v32, s39, 3
+; GFX11-NEXT: v_writelane_b32 v32, s48, 4
+; GFX11-NEXT: v_writelane_b32 v32, s49, 5
+; GFX11-NEXT: v_writelane_b32 v32, s50, 6
+; GFX11-NEXT: v_writelane_b32 v32, s51, 7
+; GFX11-NEXT: v_writelane_b32 v32, s52, 8
+; GFX11-NEXT: v_writelane_b32 v32, s53, 9
+; GFX11-NEXT: v_writelane_b32 v32, s54, 10
+; GFX11-NEXT: v_writelane_b32 v32, s55, 11
+; GFX11-NEXT: v_writelane_b32 v32, s64, 12
+; GFX11-NEXT: v_writelane_b32 v32, s65, 13
; GFX11-NEXT: s_mov_b32 s36, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v12
+; GFX11-NEXT: v_readfirstlane_b32 s65, v11
+; GFX11-NEXT: v_readfirstlane_b32 s64, v10
; GFX11-NEXT: v_readfirstlane_b32 s63, v9
; GFX11-NEXT: v_readfirstlane_b32 s62, v8
-; GFX11-NEXT: v_writelane_b32 v32, s37, 1
; GFX11-NEXT: v_readfirstlane_b32 s61, v7
; GFX11-NEXT: v_readfirstlane_b32 s60, v6
; GFX11-NEXT: v_readfirstlane_b32 s59, v5
; GFX11-NEXT: v_readfirstlane_b32 s58, v4
-; GFX11-NEXT: v_writelane_b32 v32, s38, 2
; GFX11-NEXT: v_readfirstlane_b32 s57, v3
; GFX11-NEXT: v_readfirstlane_b32 s56, v2
+; GFX11-NEXT: v_readfirstlane_b32 s55, v1
+; GFX11-NEXT: v_readfirstlane_b32 s54, v0
+; GFX11-NEXT: s_mov_b32 s53, s29
+; GFX11-NEXT: s_mov_b32 s52, s28
+; GFX11-NEXT: s_mov_b32 s51, s27
+; GFX11-NEXT: s_mov_b32 s50, s26
+; GFX11-NEXT: s_mov_b32 s49, s25
+; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s47, s23
; GFX11-NEXT: s_mov_b32 s46, s22
-; GFX11-NEXT: v_writelane_b32 v32, s39, 3
; GFX11-NEXT: s_mov_b32 s45, s21
; GFX11-NEXT: s_mov_b32 s44, s20
; GFX11-NEXT: s_mov_b32 s43, s19
; GFX11-NEXT: s_mov_b32 s42, s18
-; GFX11-NEXT: v_writelane_b32 v32, s48, 4
-; GFX11-NEXT: s_mov_b32 s48, s24
; GFX11-NEXT: s_mov_b32 s41, s17
; GFX11-NEXT: s_mov_b32 s40, s16
; GFX11-NEXT: s_mov_b32 s39, s3
-; GFX11-NEXT: v_writelane_b32 v32, s49, 5
-; GFX11-NEXT: s_mov_b32 s49, s25
; GFX11-NEXT: s_mov_b32 s38, s2
; GFX11-NEXT: s_mov_b32 s37, s1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s50, 6
-; GFX11-NEXT: s_mov_b32 s50, s26
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_writelane_b32 v32, s51, 7
-; GFX11-NEXT: s_mov_b32 s51, s27
-; GFX11-NEXT: v_writelane_b32 v32, s52, 8
-; GFX11-NEXT: s_mov_b32 s52, s28
-; GFX11-NEXT: v_writelane_b32 v32, s53, 9
-; GFX11-NEXT: s_mov_b32 s53, s29
-; GFX11-NEXT: v_writelane_b32 v32, s54, 10
-; GFX11-NEXT: v_readfirstlane_b32 s54, v0
-; GFX11-NEXT: v_writelane_b32 v32, s55, 11
-; GFX11-NEXT: v_readfirstlane_b32 s55, v1
-; GFX11-NEXT: v_writelane_b32 v32, s64, 12
-; GFX11-NEXT: v_readfirstlane_b32 s64, v10
-; GFX11-NEXT: v_writelane_b32 v32, s65, 13
-; GFX11-NEXT: v_readfirstlane_b32 s65, v11
; GFX11-NEXT: s_cbranch_scc0 .LBB39_3
; GFX11-NEXT: ; %bb.1: ; %Flow
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
@@ -26656,7 +26654,6 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v60i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -26669,6 +26666,7 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr40
@@ -26922,7 +26920,6 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v15i64_to_v60i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -26935,6 +26932,7 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr57
@@ -27143,7 +27141,6 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v15i64_to_v60i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -27156,6 +27153,7 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: ; implicit-def: $vgpr59
; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr57
@@ -27599,8 +27597,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: v_writelane_b32 v30, s54, 12
; SI-NEXT: v_writelane_b32 v30, s55, 13
; SI-NEXT: v_writelane_b32 v30, s64, 14
-; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_writelane_b32 v30, s30, 15
+; SI-NEXT: v_writelane_b32 v30, s31, 16
+; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_readfirstlane_b32 s5, v15
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_readfirstlane_b32 s7, v13
@@ -27618,7 +27617,6 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s43, v1
; SI-NEXT: s_cmp_lg_u32 s42, 0
; SI-NEXT: v_readfirstlane_b32 s42, v0
-; SI-NEXT: v_writelane_b32 v30, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB41_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s34, s5, 16
@@ -27901,8 +27899,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: v_writelane_b32 v30, s37, 3
; VI-NEXT: v_writelane_b32 v30, s38, 4
; VI-NEXT: v_writelane_b32 v30, s39, 5
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v30, s30, 6
+; VI-NEXT: v_writelane_b32 v30, s31, 7
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: v_readfirstlane_b32 s7, v14
; VI-NEXT: v_readfirstlane_b32 s8, v13
@@ -27920,7 +27919,6 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s44, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s45, v0
-; VI-NEXT: v_writelane_b32 v30, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB41_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s46, s6, 16
@@ -28190,8 +28188,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v30, s34, 0
; GFX9-NEXT: v_writelane_b32 v30, s35, 1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v30, s30, 2
+; GFX9-NEXT: v_writelane_b32 v30, s31, 3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_readfirstlane_b32 s6, v15
; GFX9-NEXT: v_readfirstlane_b32 s7, v14
; GFX9-NEXT: v_readfirstlane_b32 s8, v13
@@ -28209,7 +28208,6 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s44, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s45, v0
-; GFX9-NEXT: v_writelane_b32 v30, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB41_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s6, 16
@@ -30409,20 +30407,20 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; VI-NEXT: v_writelane_b32 v30, s83, 25
; VI-NEXT: v_writelane_b32 v30, s84, 26
; VI-NEXT: v_writelane_b32 v30, s85, 27
-; VI-NEXT: v_readfirstlane_b32 s11, v13
; VI-NEXT: v_writelane_b32 v30, s86, 28
+; VI-NEXT: v_writelane_b32 v30, s87, 29
+; VI-NEXT: v_writelane_b32 v30, s30, 30
+; VI-NEXT: v_writelane_b32 v30, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s11, v13
; VI-NEXT: s_lshr_b32 s63, s11, 16
; VI-NEXT: v_readfirstlane_b32 s13, v12
; VI-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v30, s87, 29
; VI-NEXT: s_lshr_b32 s62, s13, 16
; VI-NEXT: v_readfirstlane_b32 s15, v11
; VI-NEXT: v_writelane_b32 v31, s63, 0
-; VI-NEXT: v_writelane_b32 v30, s30, 30
; VI-NEXT: s_lshr_b32 s61, s15, 16
; VI-NEXT: v_readfirstlane_b32 s73, v10
; VI-NEXT: v_writelane_b32 v31, s62, 1
-; VI-NEXT: v_writelane_b32 v30, s31, 31
; VI-NEXT: v_readfirstlane_b32 s7, v15
; VI-NEXT: v_readfirstlane_b32 s9, v14
; VI-NEXT: s_lshr_b32 s60, s73, 16
@@ -30812,6 +30810,9 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_writelane_b32 v32, s64, 12
+; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_lshr_b32 s10, s23, 16
; GFX9-NEXT: s_lshr_b32 s11, s22, 16
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
@@ -30820,7 +30821,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s72, v15
; GFX9-NEXT: v_readfirstlane_b32 s74, v14
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
@@ -30845,7 +30845,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s11, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
; GFX9-NEXT: v_readfirstlane_b32 s10, v0
-; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -30869,7 +30868,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s22, s11, 16
; GFX9-NEXT: s_lshr_b32 s23, s10, 16
; GFX9-NEXT: v_readfirstlane_b32 s44, v16
-; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s44, s24, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s45, s25, s8
@@ -31139,7 +31137,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v60f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -31152,6 +31149,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr40
@@ -31405,7 +31403,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v15i64_to_v60f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -31418,6 +31415,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr57
@@ -31626,7 +31624,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v15i64_to_v60f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -31639,6 +31636,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: ; implicit-def: $vgpr59
; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr57
@@ -32082,8 +32080,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: v_writelane_b32 v30, s54, 12
; SI-NEXT: v_writelane_b32 v30, s55, 13
; SI-NEXT: v_writelane_b32 v30, s64, 14
-; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_writelane_b32 v30, s30, 15
+; SI-NEXT: v_writelane_b32 v30, s31, 16
+; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_readfirstlane_b32 s5, v15
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: v_readfirstlane_b32 s7, v13
@@ -32101,7 +32100,6 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s43, v1
; SI-NEXT: s_cmp_lg_u32 s42, 0
; SI-NEXT: v_readfirstlane_b32 s42, v0
-; SI-NEXT: v_writelane_b32 v30, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB45_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s34, s5, 16
@@ -32384,8 +32382,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: v_writelane_b32 v30, s37, 3
; VI-NEXT: v_writelane_b32 v30, s38, 4
; VI-NEXT: v_writelane_b32 v30, s39, 5
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v30, s30, 6
+; VI-NEXT: v_writelane_b32 v30, s31, 7
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: v_readfirstlane_b32 s7, v14
; VI-NEXT: v_readfirstlane_b32 s8, v13
@@ -32403,7 +32402,6 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s44, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s45, v0
-; VI-NEXT: v_writelane_b32 v30, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB45_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s46, s6, 16
@@ -32673,8 +32671,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v30, s34, 0
; GFX9-NEXT: v_writelane_b32 v30, s35, 1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v30, s30, 2
+; GFX9-NEXT: v_writelane_b32 v30, s31, 3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_readfirstlane_b32 s6, v15
; GFX9-NEXT: v_readfirstlane_b32 s7, v14
; GFX9-NEXT: v_readfirstlane_b32 s8, v13
@@ -32692,7 +32691,6 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s44, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s45, v0
-; GFX9-NEXT: v_writelane_b32 v30, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB45_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s6, 16
@@ -35128,28 +35126,28 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; VI-NEXT: v_writelane_b32 v32, s81, 23
; VI-NEXT: v_writelane_b32 v32, s82, 24
; VI-NEXT: v_writelane_b32 v32, s83, 25
-; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: v_writelane_b32 v32, s84, 26
+; VI-NEXT: v_writelane_b32 v32, s85, 27
+; VI-NEXT: v_writelane_b32 v32, s86, 28
+; VI-NEXT: v_writelane_b32 v32, s87, 29
+; VI-NEXT: v_writelane_b32 v32, s30, 30
+; VI-NEXT: v_writelane_b32 v32, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: s_lshr_b32 vcc_lo, s6, 16
; VI-NEXT: v_readfirstlane_b32 s8, v14
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: s_lshr_b32 vcc_hi, s8, 16
; VI-NEXT: v_readfirstlane_b32 s10, v13
; VI-NEXT: v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT: v_writelane_b32 v32, s86, 28
; VI-NEXT: s_lshr_b32 s63, s10, 16
; VI-NEXT: v_readfirstlane_b32 s12, v12
; VI-NEXT: v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT: v_writelane_b32 v32, s87, 29
; VI-NEXT: s_lshr_b32 s62, s12, 16
; VI-NEXT: v_readfirstlane_b32 s14, v11
; VI-NEXT: v_writelane_b32 v33, s63, 2
-; VI-NEXT: v_writelane_b32 v32, s30, 30
; VI-NEXT: s_lshr_b32 s61, s14, 16
; VI-NEXT: v_readfirstlane_b32 s72, v10
; VI-NEXT: v_writelane_b32 v33, s62, 3
-; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: s_lshr_b32 s60, s72, 16
; VI-NEXT: v_readfirstlane_b32 s74, v9
; VI-NEXT: v_readfirstlane_b32 s76, v8
@@ -35516,6 +35514,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_writelane_b32 v32, s64, 12
+; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_lshr_b32 s10, s23, 16
; GFX9-NEXT: s_lshr_b32 s11, s22, 16
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
@@ -35524,7 +35525,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s72, v15
; GFX9-NEXT: v_readfirstlane_b32 s74, v14
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
@@ -35549,7 +35549,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s11, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
; GFX9-NEXT: v_readfirstlane_b32 s10, v0
-; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -35573,7 +35572,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s22, s11, 16
; GFX9-NEXT: s_lshr_b32 s23, s10, 16
; GFX9-NEXT: v_readfirstlane_b32 s44, v16
-; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s44, s24, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s45, s25, s8
@@ -35844,7 +35842,6 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v60i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -35857,6 +35854,7 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr40
@@ -36095,7 +36093,6 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v15f64_to_v60i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -36108,6 +36105,7 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr57
@@ -36301,7 +36299,6 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v15f64_to_v60i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -36314,6 +36311,7 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: ; implicit-def: $vgpr59
; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr57
@@ -36680,7 +36678,20 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v62, s34, 0
; SI-NEXT: v_writelane_b32 v62, s35, 1
; SI-NEXT: v_writelane_b32 v62, s36, 2
@@ -36696,8 +36707,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; SI-NEXT: v_writelane_b32 v62, s54, 12
; SI-NEXT: v_writelane_b32 v62, s55, 13
; SI-NEXT: v_writelane_b32 v62, s64, 14
-; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_writelane_b32 v62, s30, 15
+; SI-NEXT: v_writelane_b32 v62, s31, 16
+; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_readfirstlane_b32 s15, v15
; SI-NEXT: v_readfirstlane_b32 s14, v14
; SI-NEXT: v_readfirstlane_b32 s41, v13
@@ -36715,21 +36727,6 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s43, v1
; SI-NEXT: s_cmp_lg_u32 s42, 0
; SI-NEXT: v_readfirstlane_b32 s42, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v62, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB49_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s64, s15, 16
@@ -37051,14 +37048,27 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v60, s34, 0
; VI-NEXT: v_writelane_b32 v60, s35, 1
; VI-NEXT: v_writelane_b32 v60, s36, 2
; VI-NEXT: v_writelane_b32 v60, s37, 3
; VI-NEXT: v_writelane_b32 v60, s38, 4
; VI-NEXT: v_writelane_b32 v60, s39, 5
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v60, s30, 6
+; VI-NEXT: v_writelane_b32 v60, s31, 7
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_readfirstlane_b32 s9, v15
; VI-NEXT: v_readfirstlane_b32 s8, v14
; VI-NEXT: v_readfirstlane_b32 s11, v13
@@ -37076,19 +37086,6 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; VI-NEXT: v_readfirstlane_b32 s5, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: v_writelane_b32 v60, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB49_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s46, s9, 16
@@ -37355,10 +37352,23 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v60, s34, 0
; GFX9-NEXT: v_writelane_b32 v60, s35, 1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v60, s30, 2
+; GFX9-NEXT: v_writelane_b32 v60, s31, 3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_readfirstlane_b32 s9, v15
; GFX9-NEXT: v_readfirstlane_b32 s8, v14
; GFX9-NEXT: v_readfirstlane_b32 s11, v13
@@ -37376,19 +37386,6 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: v_writelane_b32 v60, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB49_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s9, 16
@@ -39875,20 +39872,20 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; VI-NEXT: v_writelane_b32 v30, s83, 25
; VI-NEXT: v_writelane_b32 v30, s84, 26
; VI-NEXT: v_writelane_b32 v30, s85, 27
-; VI-NEXT: v_readfirstlane_b32 s11, v13
; VI-NEXT: v_writelane_b32 v30, s86, 28
+; VI-NEXT: v_writelane_b32 v30, s87, 29
+; VI-NEXT: v_writelane_b32 v30, s30, 30
+; VI-NEXT: v_writelane_b32 v30, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s11, v13
; VI-NEXT: s_lshr_b32 s63, s11, 16
; VI-NEXT: v_readfirstlane_b32 s13, v12
; VI-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v30, s87, 29
; VI-NEXT: s_lshr_b32 s62, s13, 16
; VI-NEXT: v_readfirstlane_b32 s15, v11
; VI-NEXT: v_writelane_b32 v31, s63, 0
-; VI-NEXT: v_writelane_b32 v30, s30, 30
; VI-NEXT: s_lshr_b32 s61, s15, 16
; VI-NEXT: v_readfirstlane_b32 s73, v10
; VI-NEXT: v_writelane_b32 v31, s62, 1
-; VI-NEXT: v_writelane_b32 v30, s31, 31
; VI-NEXT: v_readfirstlane_b32 s7, v15
; VI-NEXT: v_readfirstlane_b32 s9, v14
; VI-NEXT: s_lshr_b32 s60, s73, 16
@@ -40278,6 +40275,9 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_writelane_b32 v32, s64, 12
+; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_lshr_b32 s10, s23, 16
; GFX9-NEXT: s_lshr_b32 s11, s22, 16
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
@@ -40286,7 +40286,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s72, v15
; GFX9-NEXT: v_readfirstlane_b32 s74, v14
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
@@ -40311,7 +40310,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: v_readfirstlane_b32 s11, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
; GFX9-NEXT: v_readfirstlane_b32 s10, v0
-; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -40335,7 +40333,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: s_lshr_b32 s22, s11, 16
; GFX9-NEXT: s_lshr_b32 s23, s10, 16
; GFX9-NEXT: v_readfirstlane_b32 s44, v16
-; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s44, s24, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s45, s25, s8
@@ -40605,7 +40602,6 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v60f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -40618,6 +40614,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr40
@@ -40856,7 +40853,6 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) #0 {
; VI-LABEL: bitcast_v15f64_to_v60f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -40869,6 +40865,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) #0 {
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr57
@@ -41062,7 +41059,6 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) #0 {
; GFX9-LABEL: bitcast_v15f64_to_v60f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -41075,6 +41071,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) #0 {
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: ; implicit-def: $vgpr59
; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr57
@@ -41441,7 +41438,20 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v62, s34, 0
; SI-NEXT: v_writelane_b32 v62, s35, 1
; SI-NEXT: v_writelane_b32 v62, s36, 2
@@ -41457,8 +41467,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: v_writelane_b32 v62, s54, 12
; SI-NEXT: v_writelane_b32 v62, s55, 13
; SI-NEXT: v_writelane_b32 v62, s64, 14
-; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_writelane_b32 v62, s30, 15
+; SI-NEXT: v_writelane_b32 v62, s31, 16
+; SI-NEXT: v_readfirstlane_b32 s42, v16
; SI-NEXT: v_readfirstlane_b32 s15, v15
; SI-NEXT: v_readfirstlane_b32 s14, v14
; SI-NEXT: v_readfirstlane_b32 s41, v13
@@ -41476,21 +41487,6 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: v_readfirstlane_b32 s43, v1
; SI-NEXT: s_cmp_lg_u32 s42, 0
; SI-NEXT: v_readfirstlane_b32 s42, v0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v62, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB53_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s64, s15, 16
@@ -41812,14 +41808,27 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v60, s34, 0
; VI-NEXT: v_writelane_b32 v60, s35, 1
; VI-NEXT: v_writelane_b32 v60, s36, 2
; VI-NEXT: v_writelane_b32 v60, s37, 3
; VI-NEXT: v_writelane_b32 v60, s38, 4
; VI-NEXT: v_writelane_b32 v60, s39, 5
-; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_writelane_b32 v60, s30, 6
+; VI-NEXT: v_writelane_b32 v60, s31, 7
+; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: v_readfirstlane_b32 s9, v15
; VI-NEXT: v_readfirstlane_b32 s8, v14
; VI-NEXT: v_readfirstlane_b32 s11, v13
@@ -41837,19 +41846,6 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; VI-NEXT: v_readfirstlane_b32 s5, v1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_readfirstlane_b32 s4, v0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: v_writelane_b32 v60, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB53_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s46, s9, 16
@@ -42116,10 +42112,23 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v60, s34, 0
; GFX9-NEXT: v_writelane_b32 v60, s35, 1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_writelane_b32 v60, s30, 2
+; GFX9-NEXT: v_writelane_b32 v60, s31, 3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: v_readfirstlane_b32 s9, v15
; GFX9-NEXT: v_readfirstlane_b32 s8, v14
; GFX9-NEXT: v_readfirstlane_b32 s11, v13
@@ -42137,19 +42146,6 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: v_writelane_b32 v60, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB53_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s9, 16
@@ -44872,28 +44868,28 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; VI-NEXT: v_writelane_b32 v32, s81, 23
; VI-NEXT: v_writelane_b32 v32, s82, 24
; VI-NEXT: v_writelane_b32 v32, s83, 25
-; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: v_writelane_b32 v32, s84, 26
+; VI-NEXT: v_writelane_b32 v32, s85, 27
+; VI-NEXT: v_writelane_b32 v32, s86, 28
+; VI-NEXT: v_writelane_b32 v32, s87, 29
+; VI-NEXT: v_writelane_b32 v32, s30, 30
+; VI-NEXT: v_writelane_b32 v32, s31, 31
+; VI-NEXT: v_readfirstlane_b32 s6, v15
; VI-NEXT: s_lshr_b32 vcc_lo, s6, 16
; VI-NEXT: v_readfirstlane_b32 s8, v14
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: s_lshr_b32 vcc_hi, s8, 16
; VI-NEXT: v_readfirstlane_b32 s10, v13
; VI-NEXT: v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT: v_writelane_b32 v32, s86, 28
; VI-NEXT: s_lshr_b32 s63, s10, 16
; VI-NEXT: v_readfirstlane_b32 s12, v12
; VI-NEXT: v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT: v_writelane_b32 v32, s87, 29
; VI-NEXT: s_lshr_b32 s62, s12, 16
; VI-NEXT: v_readfirstlane_b32 s14, v11
; VI-NEXT: v_writelane_b32 v33, s63, 2
-; VI-NEXT: v_writelane_b32 v32, s30, 30
; VI-NEXT: s_lshr_b32 s61, s14, 16
; VI-NEXT: v_readfirstlane_b32 s72, v10
; VI-NEXT: v_writelane_b32 v33, s62, 3
-; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: s_lshr_b32 s60, s72, 16
; VI-NEXT: v_readfirstlane_b32 s74, v9
; VI-NEXT: v_readfirstlane_b32 s76, v8
@@ -45260,6 +45256,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX9-NEXT: v_writelane_b32 v32, s52, 8
; GFX9-NEXT: v_writelane_b32 v32, s53, 9
; GFX9-NEXT: v_writelane_b32 v32, s54, 10
+; GFX9-NEXT: v_writelane_b32 v32, s55, 11
+; GFX9-NEXT: v_writelane_b32 v32, s64, 12
+; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_lshr_b32 s10, s23, 16
; GFX9-NEXT: s_lshr_b32 s11, s22, 16
; GFX9-NEXT: s_lshr_b32 s12, s21, 16
@@ -45268,7 +45267,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX9-NEXT: s_lshr_b32 s15, s18, 16
; GFX9-NEXT: s_lshr_b32 s40, s17, 16
; GFX9-NEXT: s_lshr_b32 s41, s16, 16
-; GFX9-NEXT: v_writelane_b32 v32, s55, 11
; GFX9-NEXT: v_readfirstlane_b32 s72, v15
; GFX9-NEXT: v_readfirstlane_b32 s74, v14
; GFX9-NEXT: v_readfirstlane_b32 s63, v13
@@ -45293,7 +45291,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s11, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s23, s10
; GFX9-NEXT: v_readfirstlane_b32 s10, v0
-; GFX9-NEXT: v_writelane_b32 v32, s64, 12
; GFX9-NEXT: s_lshr_b32 s4, s29, 16
; GFX9-NEXT: s_lshr_b32 s5, s28, 16
; GFX9-NEXT: s_lshr_b32 s6, s27, 16
@@ -45317,7 +45314,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX9-NEXT: s_lshr_b32 s22, s11, 16
; GFX9-NEXT: s_lshr_b32 s23, s10, 16
; GFX9-NEXT: v_readfirstlane_b32 s44, v16
-; GFX9-NEXT: v_writelane_b32 v32, s65, 13
; GFX9-NEXT: s_cmp_lg_u32 s44, 0
; GFX9-NEXT: s_pack_ll_b32_b16 s44, s24, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s45, s25, s8
@@ -45588,7 +45584,6 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v60i16_to_v60f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v24
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -45605,6 +45600,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) #0 {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v24
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v48
; SI-NEXT: ; implicit-def: $vgpr48
@@ -47094,26 +47090,27 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: v_writelane_b32 v30, s83, 25
; SI-NEXT: v_writelane_b32 v30, s84, 26
; SI-NEXT: v_writelane_b32 v30, s85, 27
+; SI-NEXT: v_writelane_b32 v30, s86, 28
+; SI-NEXT: v_writelane_b32 v30, s87, 29
+; SI-NEXT: v_writelane_b32 v30, s96, 30
+; SI-NEXT: v_writelane_b32 v30, s97, 31
+; SI-NEXT: v_writelane_b32 v30, s98, 32
+; SI-NEXT: v_writelane_b32 v30, s99, 33
+; SI-NEXT: v_writelane_b32 v30, s30, 34
+; SI-NEXT: v_writelane_b32 v30, s31, 35
; SI-NEXT: s_lshr_b32 s4, s19, 16
; SI-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v30, s86, 28
+; SI-NEXT: v_readfirstlane_b32 s75, v15
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v31, s4, 0
; SI-NEXT: s_lshr_b32 s4, s18, 16
-; SI-NEXT: v_writelane_b32 v30, s87, 29
; SI-NEXT: v_writelane_b32 v31, s4, 1
-; SI-NEXT: v_writelane_b32 v30, s96, 30
; SI-NEXT: v_writelane_b32 v31, s17, 2
; SI-NEXT: s_lshr_b32 s4, s17, 16
-; SI-NEXT: v_writelane_b32 v30, s97, 31
; SI-NEXT: v_writelane_b32 v31, s4, 3
-; SI-NEXT: v_writelane_b32 v30, s98, 32
; SI-NEXT: v_writelane_b32 v31, s16, 4
; SI-NEXT: s_lshr_b32 s4, s16, 16
-; SI-NEXT: v_writelane_b32 v30, s99, 33
; SI-NEXT: v_writelane_b32 v31, s4, 5
-; SI-NEXT: v_writelane_b32 v30, s30, 34
-; SI-NEXT: v_readfirstlane_b32 s75, v15
; SI-NEXT: v_readfirstlane_b32 s70, v14
; SI-NEXT: v_readfirstlane_b32 s17, v13
; SI-NEXT: v_readfirstlane_b32 s48, v12
@@ -47130,7 +47127,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s97, v1
; SI-NEXT: v_readfirstlane_b32 s99, v0
; SI-NEXT: v_writelane_b32 v31, s18, 6
-; SI-NEXT: v_writelane_b32 v30, s31, 35
; SI-NEXT: s_mov_b32 s85, s21
; SI-NEXT: s_lshr_b32 s79, s29, 16
; SI-NEXT: s_lshr_b32 s92, s28, 16
@@ -48084,6 +48080,18 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v60, s34, 0
; GFX9-NEXT: v_writelane_b32 v60, s35, 1
; GFX9-NEXT: v_writelane_b32 v60, s30, 2
@@ -48136,18 +48144,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s44, s76, 16
; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB57_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_cbranch_execnz .LBB57_4
@@ -49891,7 +49887,21 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: v_writelane_b32 v63, s34, 0
; SI-NEXT: v_writelane_b32 v63, s35, 1
; SI-NEXT: v_writelane_b32 v63, s30, 2
@@ -49944,21 +49954,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s92, s30, 16
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB59_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_cbranch_execnz .LBB59_4
@@ -50466,6 +50461,18 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: v_writelane_b32 v60, s34, 0
; VI-NEXT: v_writelane_b32 v60, s35, 1
; VI-NEXT: v_writelane_b32 v60, s36, 2
@@ -50522,18 +50529,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; VI-NEXT: s_lshr_b32 s39, s38, 16
; VI-NEXT: v_readfirstlane_b32 s4, v16
; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB59_3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB59_4
@@ -50756,6 +50751,18 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v60, s34, 0
; GFX9-NEXT: v_writelane_b32 v60, s35, 1
; GFX9-NEXT: v_writelane_b32 v60, s30, 2
@@ -50808,18 +50815,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s44, s76, 16
; GFX9-NEXT: v_readfirstlane_b32 s4, v16
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB59_3
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_cbranch_execnz .LBB59_4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index ad53729f0ae6c..8dca9859c6786 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -424,6 +424,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX11-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GISEL-GFX11-NEXT: scratch_store_b32 off, v11, s32 offset:4
+; GISEL-GFX11-NEXT: ; meta instruction
; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32
; GISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8
; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
@@ -465,6 +466,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX11-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v11, s32 offset:4
+; DAGISEL-GFX11-NEXT: ; meta instruction
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8
; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
@@ -587,33 +589,34 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
; GISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_realign_stack:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: s_add_i32 s33, s32, 31
+; GISEL-GFX11-NEXT: s_mov_b32 s34, s32
+; GISEL-GFX11-NEXT: s_and_not1_b32 s33, s33, 31
+; GISEL-GFX11-NEXT: s_addk_i32 s32, 0xe0
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s33 ; 4-byte Folded Spill
; GISEL-GFX11-NEXT: s_mov_b32 s7, 4
; GISEL-GFX11-NEXT: s_mov_b32 s6, 3
; GISEL-GFX11-NEXT: s_mov_b32 s5, 2
; GISEL-GFX11-NEXT: s_mov_b32 s4, 1
-; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v0, 4, v8
-; GISEL-GFX11-NEXT: s_add_i32 s33, s32, 31
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v4, s7
; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
-; GISEL-GFX11-NEXT: s_and_not1_b32 s33, s33, 31
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_add_nc_u32 v5, s33, v0
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s33 ; 4-byte Folded Spill
+; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v5, s33, v0
; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
; GISEL-GFX11-NEXT: ;;#ASMSTART
; GISEL-GFX11-NEXT: s_nop
; GISEL-GFX11-NEXT: ;;#ASMEND
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v8, v9 :: v_dual_add_nc_u32 v5, 32, v5
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v9, v10 :: v_dual_mov_b32 v10, v11
-; GISEL-GFX11-NEXT: s_mov_b32 s34, s32
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v10, v11 :: v_dual_add_nc_u32 v5, 32, v5
; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo
-; GISEL-GFX11-NEXT: scratch_store_b128 v5, v[1:4], off dlc
-; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s33 ; 4-byte Folded Reload
; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi
; GISEL-GFX11-NEXT: s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT: s_addk_i32 s32, 0xe0
; GISEL-GFX11-NEXT: s_mov_b32 s32, s34
+; GISEL-GFX11-NEXT: scratch_store_b128 v5, v[1:4], off dlc
+; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s33 ; 4-byte Folded Reload
; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1
; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5]
;
@@ -621,13 +624,15 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
; GISEL-GFX10: ; %bb.0:
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
-; GISEL-GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v8
+; GISEL-GFX10-NEXT: s_mov_b32 s34, s32
; GISEL-GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 1
+; GISEL-GFX10-NEXT: s_addk_i32 s32, 0x1c00
+; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s33 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v8
; GISEL-GFX10-NEXT: v_lshrrev_b32_e64 v3, 5, s33
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 1
; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 3
; GISEL-GFX10-NEXT: v_mov_b32_e32 v5, 4
-; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s33 ; 4-byte Folded Spill
; GISEL-GFX10-NEXT: s_mov_b32 s3, s0
; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v2, v0, v3
; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2
@@ -638,7 +643,6 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
; GISEL-GFX10-NEXT: v_mov_b32_e32 v9, v10
; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2
; GISEL-GFX10-NEXT: v_mov_b32_e32 v10, v11
-; GISEL-GFX10-NEXT: s_mov_b32 s34, s32
; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi
; GISEL-GFX10-NEXT: buffer_store_dword v1, v2, s[48:51], 0 offen
@@ -651,7 +655,6 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s33 ; 4-byte Folded Reload
; GISEL-GFX10-NEXT: s_mov_b32 s0, s3
-; GISEL-GFX10-NEXT: s_addk_i32 s32, 0x1c00
; GISEL-GFX10-NEXT: s_mov_b32 s32, s34
; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
@@ -663,20 +666,20 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
; DAGISEL-GFX11-NEXT: s_mov_b32 s34, s32
; DAGISEL-GFX11-NEXT: s_and_not1_b32 s33, s33, 31
; DAGISEL-GFX11-NEXT: s_addk_i32 s32, 0xe0
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s33 ; 4-byte Folded Spill
; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
; DAGISEL-GFX11-NEXT: s_add_i32 s0, s33, 32
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v5, v8, 4, s0
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s33 ; 4-byte Folded Spill
; DAGISEL-GFX11-NEXT: ;;#ASMSTART
; DAGISEL-GFX11-NEXT: s_nop
; DAGISEL-GFX11-NEXT: ;;#ASMEND
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10
+; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v10, v11
; DAGISEL-GFX11-NEXT: scratch_store_b128 v5, v[1:4], off dlc
; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s33 ; 4-byte Folded Reload
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v10, v11
; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi
; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo
; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3
@@ -688,19 +691,20 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 4
+; DAGISEL-GFX10-NEXT: s_mov_b32 s34, s32
; DAGISEL-GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3
+; DAGISEL-GFX10-NEXT: s_addk_i32 s32, 0x1c00
+; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s33 ; 4-byte Folded Spill
; DAGISEL-GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
+; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 4
+; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 2
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v5, 1
-; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s33 ; 4-byte Folded Spill
-; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
; DAGISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 32, v0
-; DAGISEL-GFX10-NEXT: s_mov_b32 s34, s32
+; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo
-; DAGISEL-GFX10-NEXT: s_addk_i32 s32, 0x1c00
+; DAGISEL-GFX10-NEXT: s_mov_b32 s32, s34
; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, v0
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
; DAGISEL-GFX10-NEXT: s_nop
@@ -718,7 +722,6 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_realign_stack(i32
; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s33 ; 4-byte Folded Reload
; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3
-; DAGISEL-GFX10-NEXT: s_mov_b32 s32, s34
; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index a4ed384842956..20d421abf6a14 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -119,32 +119,32 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) #0 {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
+; CHECK-NEXT: s_addk_i32 s32, 0x800
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v43, s34, 0
; CHECK-NEXT: v_writelane_b32 v43, s35, 1
; CHECK-NEXT: v_writelane_b32 v43, s36, 2
; CHECK-NEXT: v_writelane_b32 v43, s37, 3
; CHECK-NEXT: v_writelane_b32 v43, s38, 4
; CHECK-NEXT: v_writelane_b32 v43, s39, 5
-; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v43, s48, 6
; CHECK-NEXT: v_writelane_b32 v43, s49, 7
+; CHECK-NEXT: v_writelane_b32 v43, s50, 8
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
+; CHECK-NEXT: v_writelane_b32 v43, s30, 12
+; CHECK-NEXT: v_writelane_b32 v43, s31, 13
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
-; CHECK-NEXT: v_writelane_b32 v43, s50, 8
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s51, 9
-; CHECK-NEXT: v_writelane_b32 v43, s52, 10
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s53, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v1
-; CHECK-NEXT: v_writelane_b32 v43, s30, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: v_writelane_b32 v43, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v2
; CHECK-NEXT: s_mov_b32 s50, s15
@@ -259,30 +259,30 @@ define double @test_powr_fast_f64(double %x, double %y) #0 {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
+; CHECK-NEXT: s_addk_i32 s32, 0x800
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v43, s34, 0
; CHECK-NEXT: v_writelane_b32 v43, s35, 1
; CHECK-NEXT: v_writelane_b32 v43, s36, 2
; CHECK-NEXT: v_writelane_b32 v43, s37, 3
; CHECK-NEXT: v_writelane_b32 v43, s38, 4
; CHECK-NEXT: v_writelane_b32 v43, s39, 5
-; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v43, s48, 6
; CHECK-NEXT: v_writelane_b32 v43, s49, 7
-; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
-; CHECK-NEXT: s_getpc_b64 s[4:5]
-; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
-; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: v_writelane_b32 v43, s50, 8
-; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v43, s51, 9
; CHECK-NEXT: v_writelane_b32 v43, s52, 10
; CHECK-NEXT: v_writelane_b32 v43, s53, 11
; CHECK-NEXT: v_writelane_b32 v43, s30, 12
-; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v43, s31, 13
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: v_mov_b32_e32 v42, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v3
; CHECK-NEXT: v_mov_b32_e32 v40, v2
@@ -401,32 +401,32 @@ define double @test_pown_fast_f64(double %x, i32 %y) #0 {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
+; CHECK-NEXT: s_addk_i32 s32, 0x800
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v43, s34, 0
; CHECK-NEXT: v_writelane_b32 v43, s35, 1
; CHECK-NEXT: v_writelane_b32 v43, s36, 2
; CHECK-NEXT: v_writelane_b32 v43, s37, 3
; CHECK-NEXT: v_writelane_b32 v43, s38, 4
; CHECK-NEXT: v_writelane_b32 v43, s39, 5
-; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v43, s48, 6
; CHECK-NEXT: v_writelane_b32 v43, s49, 7
+; CHECK-NEXT: v_writelane_b32 v43, s50, 8
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
+; CHECK-NEXT: v_writelane_b32 v43, s30, 12
+; CHECK-NEXT: v_writelane_b32 v43, s31, 13
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
-; CHECK-NEXT: v_writelane_b32 v43, s50, 8
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s51, 9
-; CHECK-NEXT: v_writelane_b32 v43, s52, 10
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s53, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v1
-; CHECK-NEXT: v_writelane_b32 v43, s30, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: v_writelane_b32 v43, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v2
; CHECK-NEXT: s_mov_b32 s50, s15
@@ -543,30 +543,30 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) #0 {
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v42, s16, 14
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v42, s34, 0
; CHECK-NEXT: v_writelane_b32 v42, s35, 1
; CHECK-NEXT: v_writelane_b32 v42, s36, 2
; CHECK-NEXT: v_writelane_b32 v42, s37, 3
; CHECK-NEXT: v_writelane_b32 v42, s38, 4
; CHECK-NEXT: v_writelane_b32 v42, s39, 5
-; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v42, s48, 6
; CHECK-NEXT: v_writelane_b32 v42, s49, 7
-; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
-; CHECK-NEXT: s_getpc_b64 s[4:5]
-; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
-; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: v_writelane_b32 v42, s50, 8
-; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v42, s51, 9
; CHECK-NEXT: v_writelane_b32 v42, s52, 10
; CHECK-NEXT: v_writelane_b32 v42, s53, 11
; CHECK-NEXT: v_writelane_b32 v42, s30, 12
+; CHECK-NEXT: v_writelane_b32 v42, s31, 13
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v42, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
@@ -684,6 +684,10 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) #0 {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 15
+; CHECK-NEXT: s_addk_i32 s32, 0x800
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v43, s34, 0
; CHECK-NEXT: v_writelane_b32 v43, s35, 1
; CHECK-NEXT: v_writelane_b32 v43, s36, 2
@@ -691,26 +695,22 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) #0 {
; CHECK-NEXT: v_writelane_b32 v43, s38, 4
; CHECK-NEXT: v_writelane_b32 v43, s39, 5
; CHECK-NEXT: v_writelane_b32 v43, s48, 6
-; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v43, s49, 7
; CHECK-NEXT: v_writelane_b32 v43, s50, 8
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
+; CHECK-NEXT: v_writelane_b32 v43, s54, 12
+; CHECK-NEXT: v_writelane_b32 v43, s30, 13
+; CHECK-NEXT: v_writelane_b32 v43, s31, 14
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
-; CHECK-NEXT: v_writelane_b32 v43, s51, 9
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s52, 10
-; CHECK-NEXT: v_writelane_b32 v43, s53, 11
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s54, 12
; CHECK-NEXT: v_mov_b32_e32 v41, v1
-; CHECK-NEXT: v_writelane_b32 v43, s30, 13
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v41
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: v_writelane_b32 v43, s31, 14
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
index 4253ed138a8cb..ca7e358fc0314 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
@@ -205,13 +205,13 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
+; GFX8-NEXT: v_writelane_b32 v3, s30, 0
; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: v_writelane_b32 v3, s31, 1
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast at gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast at gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX8-NEXT: v_writelane_b32 v3, s30, 0
-; GFX8-NEXT: v_writelane_b32 v3, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_readlane_b32 s30, v3, 0
@@ -233,13 +233,13 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX8-ARCH-FLAT-NEXT: s_add_i32 s3, s33, 8
; GFX8-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s3 ; 4-byte Folded Spill
; GFX8-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1]
+; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0
; GFX8-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16
+; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1]
; GFX8-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast at gotpcrel32@lo+4
; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast at gotpcrel32@hi+12
; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0
-; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
@@ -261,13 +261,13 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[16:17]
+; GFX9-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast at gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_writelane_b32 v3, s30, 0
-; GFX9-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: v_readlane_b32 s30, v3, 0
@@ -288,13 +288,13 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX9-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX9-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s33 ; 4-byte Folded Spill
; GFX9-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16
+; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1]
; GFX9-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast at gotpcrel32@lo+4
; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast at gotpcrel32@hi+12
; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0
-; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
@@ -315,14 +315,14 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX942-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s33 ; 4-byte Folded Spill
; GFX942-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1]
+; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0
; GFX942-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16
+; GFX942-ARCH-FLAT-NEXT: s_nop 0
+; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1]
; GFX942-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast at gotpcrel32@lo+4
; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast at gotpcrel32@hi+12
; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0
-; GFX942-ARCH-FLAT-NEXT: s_nop 1
-; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
@@ -344,13 +344,13 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: v_writelane_b32 v3, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast at gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast at gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v3, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
index b7f13a65745f9..23501793d3afd 100644
--- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
@@ -496,17 +496,29 @@ body: |
; GCN-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr40, 4352
; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.4, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec, 64, 2816
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.5, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec, 64, 2560
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.6, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec, 64, 2304
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.7, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec, 64, 2048
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.8, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec, 64, 1792
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.9, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec, 64, 1536
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.10, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec, 64, 1280
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.11, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec, 64, 1024
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.12, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec, 64, 768
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.13, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec, 64, 512
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.14, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec, 64, 256
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.15, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec, 64, 0
; GCN-NEXT: renamable $vgpr44 = COPY $vgpr13, implicit $exec
; GCN-NEXT: renamable $vgpr43 = COPY $vgpr12, implicit $exec
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit undef $scc
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 53f67834a7f6f..5af889f3697b5 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -3333,13 +3333,13 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) #0 {
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
+; GFX7-NEXT: v_writelane_b32 v2, s30, 0
; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: v_writelane_b32 v2, s31, 1
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store at gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store at gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX7-NEXT: v_writelane_b32 v2, s30, 0
-; GFX7-NEXT: v_writelane_b32 v2, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_readlane_b32 s30, v2, 0
@@ -3362,13 +3362,13 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) #0 {
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
+; GFX8-NEXT: v_writelane_b32 v2, s30, 0
; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: v_writelane_b32 v2, s31, 1
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store at gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store at gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX8-NEXT: v_writelane_b32 v2, s30, 0
-; GFX8-NEXT: v_writelane_b32 v2, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_readlane_b32 s30, v2, 0
@@ -3391,13 +3391,13 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) #0 {
; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: v_writelane_b32 v2, s30, 0
; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: v_writelane_b32 v2, s31, 1
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store at gotpcrel32@lo+4
; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store at gotpcrel32@hi+12
; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX900-NEXT: v_writelane_b32 v2, s30, 0
-; GFX900-NEXT: v_writelane_b32 v2, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX900-NEXT: v_readlane_b32 s30, v2, 0
@@ -3420,14 +3420,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) #0 {
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_store_dword off, v4, s33 ; 4-byte Folded Spill
; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: v_writelane_b32 v4, s30, 0
; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_writelane_b32 v4, s31, 1
; GFX950-NEXT: s_getpc_b64 s[0:1]
; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store at gotpcrel32@lo+4
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store at gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX950-NEXT: v_writelane_b32 v4, s30, 0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_writelane_b32 v4, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX950-NEXT: v_readlane_b32 s30, v4, 0
@@ -3451,13 +3451,13 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) #0 {
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: v_writelane_b32 v2, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store at gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store at gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v2, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
@@ -3481,16 +3481,15 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) #0 {
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store at gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store at gotpcrel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v2, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: scratch_store_b16 v1, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3513,15 +3512,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) #0 {
; GFX1250-NEXT: scratch_store_b32 off, v4, s33 nv ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s0
-; GFX1250-NEXT: s_get_pc_i64 s[0:1]
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store at gotpcrel+4
; GFX1250-NEXT: v_writelane_b32 v4, s30, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_add_co_i32 s32, s32, 16
; GFX1250-NEXT: v_writelane_b32 v4, s31, 1
+; GFX1250-NEXT: s_get_pc_i64 s[0:1]
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store at gotpcrel+4
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_readlane_b32 s30, v4, 0
; GFX1250-NEXT: scratch_store_b16 v1, v0, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -3580,13 +3578,13 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
+; GFX7-NEXT: v_writelane_b32 v2, s30, 0
; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: v_writelane_b32 v2, s31, 1
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX7-NEXT: v_writelane_b32 v2, s30, 0
-; GFX7-NEXT: v_writelane_b32 v2, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_readlane_b32 s30, v2, 0
@@ -3609,13 +3607,13 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
+; GFX8-NEXT: v_writelane_b32 v2, s30, 0
; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: v_writelane_b32 v2, s31, 1
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX8-NEXT: v_writelane_b32 v2, s30, 0
-; GFX8-NEXT: v_writelane_b32 v2, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_readlane_b32 s30, v2, 0
@@ -3638,13 +3636,13 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: v_writelane_b32 v2, s30, 0
; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: v_writelane_b32 v2, s31, 1
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX900-NEXT: v_writelane_b32 v2, s30, 0
-; GFX900-NEXT: v_writelane_b32 v2, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX900-NEXT: v_readlane_b32 s30, v2, 0
@@ -3667,14 +3665,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_store_dword off, v4, s33 ; 4-byte Folded Spill
; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: v_writelane_b32 v4, s30, 0
; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_writelane_b32 v4, s31, 1
; GFX950-NEXT: s_getpc_b64 s[0:1]
; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX950-NEXT: v_writelane_b32 v4, s30, 0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_writelane_b32 v4, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX950-NEXT: v_readlane_b32 s30, v4, 0
@@ -3698,13 +3696,13 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: v_writelane_b32 v2, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v2, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
@@ -3728,16 +3726,15 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v2, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: scratch_store_b32 v1, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3760,15 +3757,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX1250-NEXT: scratch_store_b32 off, v4, s33 nv ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s0
-; GFX1250-NEXT: s_get_pc_i64 s[0:1]
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16 at gotpcrel+4
; GFX1250-NEXT: v_writelane_b32 v4, s30, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_add_co_i32 s32, s32, 16
; GFX1250-NEXT: v_writelane_b32 v4, s31, 1
+; GFX1250-NEXT: s_get_pc_i64 s[0:1]
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16 at gotpcrel+4
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_readlane_b32 s30, v4, 0
; GFX1250-NEXT: scratch_store_b32 v1, v0, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -3831,14 +3827,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
+; GFX7-NEXT: v_writelane_b32 v4, s30, 0
; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: v_writelane_b32 v4, s31, 1
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX7-NEXT: v_writelane_b32 v4, s30, 0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_writelane_b32 v4, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v2
@@ -3864,13 +3860,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
+; GFX8-NEXT: v_writelane_b32 v4, s30, 0
; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: v_writelane_b32 v4, s31, 1
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX8-NEXT: v_writelane_b32 v4, s30, 0
-; GFX8-NEXT: v_writelane_b32 v4, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
@@ -3896,13 +3892,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: v_writelane_b32 v3, s30, 0
; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: v_writelane_b32 v3, s31, 1
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX900-NEXT: v_writelane_b32 v3, s30, 0
-; GFX900-NEXT: v_writelane_b32 v3, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX900-NEXT: v_readlane_b32 s30, v3, 0
@@ -3927,15 +3923,15 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill
; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: v_writelane_b32 v5, s30, 0
; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_getpc_b64 s[0:1]
; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX950-NEXT: v_writelane_b32 v5, s30, 0
; GFX950-NEXT: v_mov_b32_e32 v4, v2
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX950-NEXT: v_readlane_b32 s30, v5, 0
@@ -3961,13 +3957,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: v_writelane_b32 v3, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v3, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
@@ -3993,16 +3989,15 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v3, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v3, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v3, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_writelane_b32 v3, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -4027,16 +4022,15 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX1250-NEXT: scratch_store_b32 off, v5, s33 nv ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s0
-; GFX1250-NEXT: s_get_pc_i64 s[0:1]
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16 at gotpcrel+4
; GFX1250-NEXT: v_writelane_b32 v5, s30, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_add_co_i32 s32, s32, 16
-; GFX1250-NEXT: v_mov_b32_e32 v4, v2
; GFX1250-NEXT: v_writelane_b32 v5, s31, 1
+; GFX1250-NEXT: s_get_pc_i64 s[0:1]
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16 at gotpcrel+4
+; GFX1250-NEXT: v_mov_b32_e32 v4, v2
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -4109,13 +4103,13 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
+; GFX7-NEXT: v_writelane_b32 v6, s30, 0
; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: v_writelane_b32 v6, s31, 1
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX7-NEXT: v_writelane_b32 v6, s30, 0
-; GFX7-NEXT: v_writelane_b32 v6, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v2
@@ -4149,13 +4143,13 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
+; GFX8-NEXT: v_writelane_b32 v4, s30, 0
; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: v_writelane_b32 v4, s31, 1
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX8-NEXT: v_writelane_b32 v4, s30, 0
-; GFX8-NEXT: v_writelane_b32 v4, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
@@ -4181,13 +4175,13 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: v_writelane_b32 v3, s30, 0
; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: v_writelane_b32 v3, s31, 1
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX900-NEXT: v_writelane_b32 v3, s30, 0
-; GFX900-NEXT: v_writelane_b32 v3, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX900-NEXT: v_readlane_b32 s30, v3, 0
@@ -4212,15 +4206,15 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill
; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: v_writelane_b32 v5, s30, 0
; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_getpc_b64 s[0:1]
; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX950-NEXT: v_writelane_b32 v5, s30, 0
; GFX950-NEXT: v_mov_b32_e32 v4, v2
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX950-NEXT: v_readlane_b32 s30, v5, 0
@@ -4244,13 +4238,13 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: v_writelane_b32 v3, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16 at gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v3, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
@@ -4276,16 +4270,15 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v3, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v3, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v3, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_writelane_b32 v3, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -4308,16 +4301,15 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX1250-NEXT: scratch_store_b32 off, v5, s33 nv ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s0
-; GFX1250-NEXT: s_get_pc_i64 s[0:1]
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4
; GFX1250-NEXT: v_writelane_b32 v5, s30, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_add_co_i32 s32, s32, 16
-; GFX1250-NEXT: v_mov_b32_e32 v4, v2
; GFX1250-NEXT: v_writelane_b32 v5, s31, 1
+; GFX1250-NEXT: s_get_pc_i64 s[0:1]
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4
+; GFX1250-NEXT: v_mov_b32_e32 v4, v2
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: scratch_store_b64 v4, v[0:1], off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -4401,13 +4393,13 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
+; GFX7-NEXT: v_writelane_b32 v10, s30, 0
; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: v_writelane_b32 v10, s31, 1
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX7-NEXT: v_writelane_b32 v10, s30, 0
-; GFX7-NEXT: v_writelane_b32 v10, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 12, v4
@@ -4455,13 +4447,13 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
+; GFX8-NEXT: v_writelane_b32 v6, s30, 0
; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: v_writelane_b32 v6, s31, 1
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX8-NEXT: v_writelane_b32 v6, s30, 0
-; GFX8-NEXT: v_writelane_b32 v6, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 12, v4
@@ -4493,13 +4485,13 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX900-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: v_writelane_b32 v5, s30, 0
; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: v_writelane_b32 v5, s31, 1
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX900-NEXT: v_writelane_b32 v5, s30, 0
-; GFX900-NEXT: v_writelane_b32 v5, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX900-NEXT: v_readlane_b32 s30, v5, 0
@@ -4528,14 +4520,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill
; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: v_writelane_b32 v5, s30, 0
; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_getpc_b64 s[0:1]
; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX950-NEXT: v_writelane_b32 v5, s30, 0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX950-NEXT: v_readlane_b32 s30, v5, 0
@@ -4559,13 +4551,13 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: v_writelane_b32 v5, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v5, s31, 1
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v5, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_writelane_b32 v5, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: v_readlane_b32 s30, v5, 0
@@ -4595,16 +4587,15 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v5, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v5, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v5, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_writelane_b32 v5, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v5, 0
; GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -4627,15 +4618,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX1250-NEXT: scratch_store_b32 off, v5, s33 nv ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s0
-; GFX1250-NEXT: s_get_pc_i64 s[0:1]
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4
; GFX1250-NEXT: v_writelane_b32 v5, s30, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_add_co_i32 s32, s32, 16
; GFX1250-NEXT: v_writelane_b32 v5, s31, 1
+; GFX1250-NEXT: s_get_pc_i64 s[0:1]
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: scratch_store_b128 v4, v[0:3], off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -4747,13 +4737,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
+; GFX7-NEXT: v_writelane_b32 v18, s30, 0
; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: v_writelane_b32 v18, s31, 1
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX7-NEXT: v_writelane_b32 v18, s30, 0
-; GFX7-NEXT: v_writelane_b32 v18, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 28, v8
@@ -4829,13 +4819,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
+; GFX8-NEXT: v_writelane_b32 v10, s30, 0
; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: v_writelane_b32 v10, s31, 1
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX8-NEXT: v_writelane_b32 v10, s30, 0
-; GFX8-NEXT: v_writelane_b32 v10, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 28, v8
@@ -4879,13 +4869,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX900-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: v_writelane_b32 v9, s30, 0
; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: v_writelane_b32 v9, s31, 1
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX900-NEXT: v_writelane_b32 v9, s30, 0
-; GFX900-NEXT: v_writelane_b32 v9, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX900-NEXT: v_readlane_b32 s30, v9, 0
@@ -4922,14 +4912,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_store_dword off, v9, s33 ; 4-byte Folded Spill
; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: v_writelane_b32 v9, s30, 0
; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_writelane_b32 v9, s31, 1
; GFX950-NEXT: s_getpc_b64 s[0:1]
; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX950-NEXT: v_writelane_b32 v9, s30, 0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_writelane_b32 v9, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX950-NEXT: v_readlane_b32 s30, v9, 0
@@ -4955,13 +4945,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: v_writelane_b32 v9, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v9, s31, 1
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v9, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_writelane_b32 v9, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: v_readlane_b32 s30, v9, 0
@@ -4999,16 +4989,15 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v9, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v9, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v9, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_writelane_b32 v9, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v9, 0
; GFX11-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -5033,15 +5022,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) #0 {
; GFX1250-NEXT: scratch_store_b32 off, v9, s33 nv ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s0
-; GFX1250-NEXT: s_get_pc_i64 s[0:1]
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4
; GFX1250-NEXT: v_writelane_b32 v9, s30, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_add_co_i32 s32, s32, 16
; GFX1250-NEXT: v_writelane_b32 v9, s31, 1
+; GFX1250-NEXT: s_get_pc_i64 s[0:1]
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_readlane_b32 s30, v9, 0
; GFX1250-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -8352,6 +8340,17 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) #0
; GFX8-LABEL: global_extload_v32bf16_to_v32f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 2, v1
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v1
@@ -8387,17 +8386,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) #0
; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1
; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
-; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX8-NEXT: flat_load_ushort v44, v[1:2]
; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1
@@ -8855,16 +8843,21 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) #0
; GFX950-LABEL: global_extload_v32bf16_to_v32f64:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
-; GFX950-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v2, v1
; GFX950-NEXT: global_load_ushort v1, v[2:3], off offset:2
; GFX950-NEXT: global_load_ushort v4, v[2:3], off offset:12
; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:8
@@ -8897,11 +8890,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: global_load_ushort v56, v[2:3], off offset:48
; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:54
; GFX950-NEXT: global_load_ushort v58, v[2:3], off offset:58
-; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
; GFX950-NEXT: s_waitcnt vmcnt(31)
; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX950-NEXT: s_waitcnt vmcnt(30)
@@ -12669,12 +12657,12 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) #0 {
; GFX950-LABEL: v_fadd_v32bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX950-NEXT: scratch_load_dword v31, off, s32
; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
@@ -17866,12 +17854,12 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) #0 {
; GFX950-LABEL: v_fmul_v32bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX950-NEXT: scratch_load_dword v31, off, s32
; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
@@ -22860,12 +22848,12 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) #0 {
; GFX950-LABEL: v_minnum_v32bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX950-NEXT: scratch_load_dword v31, off, s32
; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
@@ -27288,12 +27276,12 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) #0 {
; GFX950-LABEL: v_maxnum_v32bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX950-NEXT: scratch_load_dword v31, off, s32
; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
@@ -44747,6 +44735,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: v_writelane_b32 v33, s34, 0
+; GFX7-NEXT: v_writelane_b32 v33, s35, 1
+; GFX7-NEXT: v_writelane_b32 v33, s30, 2
+; GFX7-NEXT: v_writelane_b32 v33, s31, 3
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v1
@@ -44802,16 +44794,12 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX7-NEXT: v_and_b32_e32 v0, 1, v26
; GFX7-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v27
-; GFX7-NEXT: v_writelane_b32 v33, s34, 0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v28
-; GFX7-NEXT: v_writelane_b32 v33, s35, 1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v29
-; GFX7-NEXT: v_writelane_b32 v33, s30, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v30
-; GFX7-NEXT: v_writelane_b32 v33, s31, 3
; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -44915,6 +44903,14 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v34, s34, 0
+; GFX8-NEXT: v_writelane_b32 v34, s35, 1
+; GFX8-NEXT: v_writelane_b32 v34, s36, 2
+; GFX8-NEXT: v_writelane_b32 v34, s37, 3
+; GFX8-NEXT: v_writelane_b32 v34, s38, 4
+; GFX8-NEXT: v_writelane_b32 v34, s39, 5
+; GFX8-NEXT: v_writelane_b32 v34, s30, 6
+; GFX8-NEXT: v_writelane_b32 v34, s31, 7
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v1
@@ -44958,28 +44954,20 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX8-NEXT: v_and_b32_e32 v0, 1, v20
; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v21
-; GFX8-NEXT: v_writelane_b32 v34, s34, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v22
-; GFX8-NEXT: v_writelane_b32 v34, s35, 1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v23
-; GFX8-NEXT: v_writelane_b32 v34, s36, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v24
-; GFX8-NEXT: v_writelane_b32 v34, s37, 3
; GFX8-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v25
-; GFX8-NEXT: v_writelane_b32 v34, s38, 4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v26
-; GFX8-NEXT: v_writelane_b32 v34, s39, 5
; GFX8-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v27
-; GFX8-NEXT: v_writelane_b32 v34, s30, 6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v28
-; GFX8-NEXT: v_writelane_b32 v34, s31, 7
; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v29
; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
@@ -45139,6 +45127,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_writelane_b32 v33, s34, 0
+; GFX900-NEXT: v_writelane_b32 v33, s35, 1
+; GFX900-NEXT: v_writelane_b32 v33, s30, 2
+; GFX900-NEXT: v_writelane_b32 v33, s31, 3
; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX900-NEXT: v_and_b32_e32 v0, 1, v3
@@ -45198,10 +45190,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX900-NEXT: v_and_b32_e32 v0, 1, v28
; GFX900-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0
; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32
-; GFX900-NEXT: v_writelane_b32 v33, s34, 0
-; GFX900-NEXT: v_writelane_b32 v33, s35, 1
-; GFX900-NEXT: v_writelane_b32 v33, s30, 2
-; GFX900-NEXT: v_writelane_b32 v33, s31, 3
; GFX900-NEXT: v_and_b32_e32 v1, 1, v1
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -45342,6 +45330,12 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:60
; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:124
; GFX950-NEXT: scratch_load_ushort v33, off, s32
@@ -45366,17 +45360,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:104
; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:40
; GFX950-NEXT: v_and_b32_e32 v29, 1, v29
-; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v29
; GFX950-NEXT: scratch_load_dword v29, off, s32 offset:84
; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:20
; GFX950-NEXT: v_and_b32_e32 v28, 1, v28
-; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v28
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v26, 1, v26
; GFX950-NEXT: v_and_b32_e32 v27, 1, v27
; GFX950-NEXT: v_and_b32_e32 v24, 1, v24
@@ -50127,6 +50115,21 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf
; GFX950-LABEL: v_fma_v32bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:64
; GFX950-NEXT: scratch_load_dword v32, off, s32
; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:60
@@ -50143,12 +50146,6 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf
; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:16
; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:12
; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:8
-; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v42, 0xffff0000, v14
; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v14
; GFX950-NEXT: v_and_b32_e32 v45, 0xffff0000, v29
@@ -50156,22 +50153,13 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf
; GFX950-NEXT: v_and_b32_e32 v58, 0xffff0000, v12
; GFX950-NEXT: v_lshlrev_b32_e32 v59, 16, v28
; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v60, 0xffff0000, v27
; GFX950-NEXT: v_and_b32_e32 v61, 0xffff0000, v11
; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v15
; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v15
; GFX950-NEXT: v_and_b32_e32 v57, 0xffff0000, v28
-; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v41, 0xffff0000, v30
; GFX950-NEXT: v_lshlrev_b32_e32 v43, 16, v30
; GFX950-NEXT: v_and_b32_e32 v46, 0xffff0000, v13
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index fb11d3b7d9d65..2f6f9e45cafbf 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -939,10 +939,10 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v0, s99, 34
; CHECK-NEXT: v_writelane_b32 v0, s100, 35
; CHECK-NEXT: v_writelane_b32 v0, s101, 36
-; CHECK-NEXT: s_mov_b32 s40, s12
; CHECK-NEXT: v_writelane_b32 v0, s30, 37
-; CHECK-NEXT: s_cmp_eq_u32 s40, 0
; CHECK-NEXT: v_writelane_b32 v0, s31, 38
+; CHECK-NEXT: s_mov_b32 s40, s12
+; CHECK-NEXT: s_cmp_eq_u32 s40, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
index 485cedb33dcca..05adc43fe0463 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
@@ -18,12 +18,12 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -46,13 +46,13 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: v_writelane_b32 v40, s1, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_bf16_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_bf16_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -77,12 +77,12 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -105,13 +105,13 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: v_writelane_b32 v40, s1, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2bf16_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2bf16_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll
index 2d49681c22496..9ae866ea33226 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll
@@ -34,14 +34,14 @@ define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #
; CHECK-NEXT: s_xor_saveexec_b64 s[40:41], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[40:41]
+; CHECK-NEXT: v_writelane_b32 v1, s30, 0
; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v1, s31, 1
; CHECK-NEXT: v_readfirstlane_b32 s42, v0
-; CHECK-NEXT: v_writelane_b32 v1, s30, 0
; CHECK-NEXT: s_getpc_b64 s[40:41]
; CHECK-NEXT: s_add_u32 s40, s40, external_void_func_a15i32_inreg at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg at rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, s42
-; CHECK-NEXT: v_writelane_b32 v1, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[40:41]
; CHECK-NEXT: v_readlane_b32 s30, v1, 0
; CHECK-NEXT: v_readlane_b32 s31, v1, 1
@@ -65,16 +65,16 @@ define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #
; CHECK-NEXT: s_xor_saveexec_b64 s[40:41], -1
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[40:41]
+; CHECK-NEXT: v_writelane_b32 v2, s30, 0
; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v2, s31, 1
; CHECK-NEXT: v_readfirstlane_b32 s42, v0
; CHECK-NEXT: v_readfirstlane_b32 s43, v1
-; CHECK-NEXT: v_writelane_b32 v2, s30, 0
; CHECK-NEXT: s_getpc_b64 s[40:41]
; CHECK-NEXT: s_add_u32 s40, s40, external_void_func_a16i32_inreg at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s41, s41, external_void_func_a16i32_inreg at rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, s42
; CHECK-NEXT: v_mov_b32_e32 v1, s43
-; CHECK-NEXT: v_writelane_b32 v2, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[40:41]
; CHECK-NEXT: v_readlane_b32 s30, v2, 0
; CHECK-NEXT: v_readlane_b32 s31, v2, 1
@@ -98,16 +98,16 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inre
; CHECK-NEXT: s_xor_saveexec_b64 s[40:41], -1
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[40:41]
+; CHECK-NEXT: v_writelane_b32 v2, s30, 0
; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v2, s31, 1
; CHECK-NEXT: v_readfirstlane_b32 s42, v0
; CHECK-NEXT: v_readfirstlane_b32 s43, v1
-; CHECK-NEXT: v_writelane_b32 v2, s30, 0
; CHECK-NEXT: s_getpc_b64 s[40:41]
; CHECK-NEXT: s_add_u32 s40, s40, external_void_func_a15i32_inreg_i32_inreg at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg_i32_inreg at rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, s42
; CHECK-NEXT: v_mov_b32_e32 v1, s43
-; CHECK-NEXT: v_writelane_b32 v2, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[40:41]
; CHECK-NEXT: v_readlane_b32 s30, v2, 0
; CHECK-NEXT: v_readlane_b32 s31, v2, 1
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index ddd94108a1e39..d42666a4d4cf4 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -41,12 +41,12 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i8_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i8_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -69,13 +69,13 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: v_writelane_b32 v40, s1, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -100,12 +100,12 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i16_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i16_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -128,13 +128,13 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: v_writelane_b32 v40, s1, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -159,12 +159,12 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i32_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i32_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -187,13 +187,13 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: v_writelane_b32 v40, s1, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i32_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i32_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -218,12 +218,12 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i64_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i64_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -246,13 +246,13 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_writelane_b32 v40, s2, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i64_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i64_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -277,12 +277,12 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2i32_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -305,13 +305,13 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_writelane_b32 v40, s2, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -336,12 +336,12 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-NEXT: v_writelane_b32 v40, s19, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[20:21]
; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v3i32_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -364,13 +364,13 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s16
; GFX11-NEXT: v_writelane_b32 v40, s3, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[16:17]
; GFX11-NEXT: s_add_u32 s16, s16, external_void_func_v3i32_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s17, s17, external_void_func_v3i32_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -395,12 +395,12 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-NEXT: v_writelane_b32 v40, s20, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[20:21]
; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v4i32_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -423,13 +423,13 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s17
; GFX11-NEXT: v_writelane_b32 v40, s16, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[16:17]
; GFX11-NEXT: s_add_u32 s16, s16, external_void_func_v4i32_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s17, s17, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -454,12 +454,12 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[26:27]
; GFX9-NEXT: v_writelane_b32 v40, s24, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[24:25]
; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_v8i32_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -482,13 +482,13 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s21
; GFX11-NEXT: v_writelane_b32 v40, s20, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[20:21]
; GFX11-NEXT: s_add_u32 s20, s20, external_void_func_v8i32_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s21, s21, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -513,12 +513,12 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f16_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f16_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -541,13 +541,13 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: v_writelane_b32 v40, s1, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -572,12 +572,12 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f32_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f32_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -600,13 +600,13 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: v_writelane_b32 v40, s1, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f32_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f32_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -631,12 +631,12 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f64_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f64_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -659,13 +659,13 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_writelane_b32 v40, s2, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f64_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f64_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -690,12 +690,12 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2f16_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2f16_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -718,13 +718,13 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: v_writelane_b32 v40, s1, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f16_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f16_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -749,12 +749,12 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v3f16_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -777,13 +777,13 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_writelane_b32 v40, s2, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -808,12 +808,12 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v4f16_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4f16_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -836,13 +836,13 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_writelane_b32 v40, s2, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4f16_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4f16_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -867,12 +867,12 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p0_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p0_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -895,13 +895,13 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_writelane_b32 v40, s2, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_p0_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_p0_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -926,12 +926,12 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p1_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p1_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -954,13 +954,13 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_writelane_b32 v40, s2, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_p1_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_p1_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -985,12 +985,12 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p3_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p3_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1013,13 +1013,13 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: v_writelane_b32 v40, s1, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_p3_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_p3_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -1044,12 +1044,12 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-NEXT: v_writelane_b32 v40, s20, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[20:21]
; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v2p1_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1072,13 +1072,13 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
; GFX11-NEXT: s_mov_b32 exec_lo, s17
; GFX11-NEXT: v_writelane_b32 v40, s16, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[16:17]
; GFX11-NEXT: s_add_u32 s16, s16, external_void_func_v2p1_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s17, s17, external_void_func_v2p1_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -1103,12 +1103,12 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2p5_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p5_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1131,13 +1131,13 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_writelane_b32 v40, s2, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2p5_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2p5_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -1162,12 +1162,12 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-NEXT: v_writelane_b32 v40, s21, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[22:23]
; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1190,13 +1190,13 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
; GFX11-NEXT: s_mov_b32 exec_lo, s18
; GFX11-NEXT: v_writelane_b32 v40, s17, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[18:19]
; GFX11-NEXT: s_add_u32 s18, s18, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s19, s19, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -1221,12 +1221,12 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-NEXT: v_writelane_b32 v40, s29, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[40:41]
; GFX9-NEXT: s_add_u32 s40, s40, external_void_func_a15i32_inreg at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[40:41]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1249,13 +1249,13 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
; GFX11-NEXT: s_mov_b32 exec_lo, s26
; GFX11-NEXT: v_writelane_b32 v40, s25, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[26:27]
; GFX11-NEXT: s_add_u32 s26, s26, external_void_func_a15i32_inreg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s27, s27, external_void_func_a15i32_inreg at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[26:27]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -1282,12 +1282,12 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-NEXT: v_writelane_b32 v40, s21, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[22:23]
; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit at rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1310,13 +1310,13 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
; GFX11-NEXT: s_mov_b32 exec_lo, s18
; GFX11-NEXT: v_writelane_b32 v40, s17, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[18:19]
; GFX11-NEXT: s_add_u32 s18, s18, external_void_func_a15i32_inreg_i32_inreg__noimplicit at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s19, s19, external_void_func_a15i32_inreg_i32_inreg__noimplicit at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 834517d9c9b39..67c15f0d041cc 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -7130,7 +7130,10 @@ define void @stack_12xv3i32() #0 {
; VI-NEXT: s_or_saveexec_b64 s[8:9], -1
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[8:9]
+; VI-NEXT: v_writelane_b32 v40, s4, 2
+; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: s_addk_i32 s32, 0x400
+; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: v_mov_b32_e32 v0, 11
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32
; VI-NEXT: v_mov_b32_e32 v0, 12
@@ -7138,10 +7141,8 @@ define void @stack_12xv3i32() #0 {
; VI-NEXT: v_mov_b32_e32 v0, 13
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; VI-NEXT: v_mov_b32_e32 v0, 14
-; VI-NEXT: v_writelane_b32 v40, s4, 2
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; VI-NEXT: v_mov_b32_e32 v0, 15
-; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
@@ -7177,7 +7178,6 @@ define void @stack_12xv3i32() #0 {
; VI-NEXT: v_mov_b32_e32 v28, 9
; VI-NEXT: v_mov_b32_e32 v29, 9
; VI-NEXT: v_mov_b32_e32 v30, 10
-; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: v_readlane_b32 s30, v40, 0
; VI-NEXT: v_readlane_b32 s31, v40, 1
@@ -7198,7 +7198,10 @@ define void @stack_12xv3i32() #0 {
; CI-NEXT: s_or_saveexec_b64 s[8:9], -1
; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CI-NEXT: s_mov_b64 exec, s[8:9]
+; CI-NEXT: v_writelane_b32 v40, s4, 2
+; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: s_addk_i32 s32, 0x400
+; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: v_mov_b32_e32 v0, 11
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32
; CI-NEXT: v_mov_b32_e32 v0, 12
@@ -7206,10 +7209,8 @@ define void @stack_12xv3i32() #0 {
; CI-NEXT: v_mov_b32_e32 v0, 13
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; CI-NEXT: v_mov_b32_e32 v0, 14
-; CI-NEXT: v_writelane_b32 v40, s4, 2
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; CI-NEXT: v_mov_b32_e32 v0, 15
-; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
@@ -7245,7 +7246,6 @@ define void @stack_12xv3i32() #0 {
; CI-NEXT: v_mov_b32_e32 v28, 9
; CI-NEXT: v_mov_b32_e32 v29, 9
; CI-NEXT: v_mov_b32_e32 v30, 10
-; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: v_readlane_b32 s30, v40, 0
; CI-NEXT: v_readlane_b32 s31, v40, 1
@@ -7266,7 +7266,10 @@ define void @stack_12xv3i32() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-NEXT: v_writelane_b32 v40, s4, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 11
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: v_mov_b32_e32 v0, 12
@@ -7274,10 +7277,8 @@ define void @stack_12xv3i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, 14
-; GFX9-NEXT: v_writelane_b32 v40, s4, 2
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
@@ -7313,7 +7314,6 @@ define void @stack_12xv3i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v28, 9
; GFX9-NEXT: v_mov_b32_e32 v29, 9
; GFX9-NEXT: v_mov_b32_e32 v30, 10
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -7335,11 +7335,12 @@ define void @stack_12xv3i32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 11 :: v_dual_mov_b32 v1, 12
; GFX11-NEXT: v_dual_mov_b32 v2, 13 :: v_dual_mov_b32 v3, 14
; GFX11-NEXT: v_mov_b32_e32 v4, 15
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s0, s32, 16
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b32 off, v4, s0
@@ -7362,9 +7363,8 @@ define void @stack_12xv3i32() #0 {
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32 at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -7384,7 +7384,10 @@ define void @stack_12xv3i32() #0 {
; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1
; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HSA-NEXT: s_mov_b64 exec, s[8:9]
+; HSA-NEXT: v_writelane_b32 v40, s4, 2
+; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: s_addk_i32 s32, 0x400
+; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: v_mov_b32_e32 v0, 11
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32
; HSA-NEXT: v_mov_b32_e32 v0, 12
@@ -7392,10 +7395,8 @@ define void @stack_12xv3i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v0, 13
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; HSA-NEXT: v_mov_b32_e32 v0, 14
-; HSA-NEXT: v_writelane_b32 v40, s4, 2
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; HSA-NEXT: v_mov_b32_e32 v0, 15
-; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
@@ -7431,7 +7432,6 @@ define void @stack_12xv3i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v28, 9
; HSA-NEXT: v_mov_b32_e32 v29, 9
; HSA-NEXT: v_mov_b32_e32 v30, 10
-; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT: v_readlane_b32 s30, v40, 0
; HSA-NEXT: v_readlane_b32 s31, v40, 1
@@ -7469,7 +7469,10 @@ define void @stack_12xv3f32() #0 {
; VI-NEXT: s_or_saveexec_b64 s[8:9], -1
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[8:9]
+; VI-NEXT: v_writelane_b32 v40, s4, 2
+; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: s_addk_i32 s32, 0x400
+; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: v_mov_b32_e32 v0, 0x41300000
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32
; VI-NEXT: v_mov_b32_e32 v0, 0x41400000
@@ -7477,10 +7480,8 @@ define void @stack_12xv3f32() #0 {
; VI-NEXT: v_mov_b32_e32 v0, 0x41500000
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; VI-NEXT: v_mov_b32_e32 v0, 0x41600000
-; VI-NEXT: v_writelane_b32 v40, s4, 2
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; VI-NEXT: v_mov_b32_e32 v0, 0x41700000
-; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
@@ -7516,7 +7517,6 @@ define void @stack_12xv3f32() #0 {
; VI-NEXT: v_mov_b32_e32 v28, 0x41100000
; VI-NEXT: v_mov_b32_e32 v29, 0x41100000
; VI-NEXT: v_mov_b32_e32 v30, 0x41200000
-; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: v_readlane_b32 s30, v40, 0
; VI-NEXT: v_readlane_b32 s31, v40, 1
@@ -7537,7 +7537,10 @@ define void @stack_12xv3f32() #0 {
; CI-NEXT: s_or_saveexec_b64 s[8:9], -1
; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CI-NEXT: s_mov_b64 exec, s[8:9]
+; CI-NEXT: v_writelane_b32 v40, s4, 2
+; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: s_addk_i32 s32, 0x400
+; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: v_mov_b32_e32 v0, 0x41300000
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32
; CI-NEXT: v_mov_b32_e32 v0, 0x41400000
@@ -7545,10 +7548,8 @@ define void @stack_12xv3f32() #0 {
; CI-NEXT: v_mov_b32_e32 v0, 0x41500000
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; CI-NEXT: v_mov_b32_e32 v0, 0x41600000
-; CI-NEXT: v_writelane_b32 v40, s4, 2
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; CI-NEXT: v_mov_b32_e32 v0, 0x41700000
-; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
@@ -7584,7 +7585,6 @@ define void @stack_12xv3f32() #0 {
; CI-NEXT: v_mov_b32_e32 v28, 0x41100000
; CI-NEXT: v_mov_b32_e32 v29, 0x41100000
; CI-NEXT: v_mov_b32_e32 v30, 0x41200000
-; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: v_readlane_b32 s30, v40, 0
; CI-NEXT: v_readlane_b32 s31, v40, 1
@@ -7605,7 +7605,10 @@ define void @stack_12xv3f32() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-NEXT: v_writelane_b32 v40, s4, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41300000
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41400000
@@ -7613,10 +7616,8 @@ define void @stack_12xv3f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000
-; GFX9-NEXT: v_writelane_b32 v40, s4, 2
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
@@ -7652,7 +7653,6 @@ define void @stack_12xv3f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v28, 0x41100000
; GFX9-NEXT: v_mov_b32_e32 v29, 0x41100000
; GFX9-NEXT: v_mov_b32_e32 v30, 0x41200000
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -7674,13 +7674,14 @@ define void @stack_12xv3f32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_mov_b32_e32 v0, 0x41300000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x41400000
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41500000
; GFX11-NEXT: v_mov_b32_e32 v3, 0x41600000
; GFX11-NEXT: v_dual_mov_b32 v4, 0x41700000 :: v_dual_mov_b32 v5, 1.0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s0, s32, 16
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b32 off, v4, s0
@@ -7705,9 +7706,8 @@ define void @stack_12xv3f32() #0 {
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3f32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3f32 at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -7727,7 +7727,10 @@ define void @stack_12xv3f32() #0 {
; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1
; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HSA-NEXT: s_mov_b64 exec, s[8:9]
+; HSA-NEXT: v_writelane_b32 v40, s4, 2
+; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: s_addk_i32 s32, 0x400
+; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: v_mov_b32_e32 v0, 0x41300000
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32
; HSA-NEXT: v_mov_b32_e32 v0, 0x41400000
@@ -7735,10 +7738,8 @@ define void @stack_12xv3f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v0, 0x41500000
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; HSA-NEXT: v_mov_b32_e32 v0, 0x41600000
-; HSA-NEXT: v_writelane_b32 v40, s4, 2
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000
-; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
@@ -7774,7 +7775,6 @@ define void @stack_12xv3f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v28, 0x41100000
; HSA-NEXT: v_mov_b32_e32 v29, 0x41100000
; HSA-NEXT: v_mov_b32_e32 v30, 0x41200000
-; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT: v_readlane_b32 s30, v40, 0
; HSA-NEXT: v_readlane_b32 s31, v40, 1
@@ -7812,7 +7812,10 @@ define void @stack_8xv5i32() #0 {
; VI-NEXT: s_or_saveexec_b64 s[8:9], -1
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[8:9]
+; VI-NEXT: v_writelane_b32 v40, s4, 2
+; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: s_addk_i32 s32, 0x400
+; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: v_mov_b32_e32 v0, 7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32
; VI-NEXT: v_mov_b32_e32 v0, 8
@@ -7828,10 +7831,8 @@ define void @stack_8xv5i32() #0 {
; VI-NEXT: v_mov_b32_e32 v0, 13
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; VI-NEXT: v_mov_b32_e32 v0, 14
-; VI-NEXT: v_writelane_b32 v40, s4, 2
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28
; VI-NEXT: v_mov_b32_e32 v0, 15
-; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
@@ -7867,7 +7868,6 @@ define void @stack_8xv5i32() #0 {
; VI-NEXT: v_mov_b32_e32 v28, 5
; VI-NEXT: v_mov_b32_e32 v29, 5
; VI-NEXT: v_mov_b32_e32 v30, 6
-; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: v_readlane_b32 s30, v40, 0
; VI-NEXT: v_readlane_b32 s31, v40, 1
@@ -7888,7 +7888,10 @@ define void @stack_8xv5i32() #0 {
; CI-NEXT: s_or_saveexec_b64 s[8:9], -1
; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CI-NEXT: s_mov_b64 exec, s[8:9]
+; CI-NEXT: v_writelane_b32 v40, s4, 2
+; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: s_addk_i32 s32, 0x400
+; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: v_mov_b32_e32 v0, 7
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32
; CI-NEXT: v_mov_b32_e32 v0, 8
@@ -7904,10 +7907,8 @@ define void @stack_8xv5i32() #0 {
; CI-NEXT: v_mov_b32_e32 v0, 13
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; CI-NEXT: v_mov_b32_e32 v0, 14
-; CI-NEXT: v_writelane_b32 v40, s4, 2
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28
; CI-NEXT: v_mov_b32_e32 v0, 15
-; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
@@ -7943,7 +7944,6 @@ define void @stack_8xv5i32() #0 {
; CI-NEXT: v_mov_b32_e32 v28, 5
; CI-NEXT: v_mov_b32_e32 v29, 5
; CI-NEXT: v_mov_b32_e32 v30, 6
-; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: v_readlane_b32 s30, v40, 0
; CI-NEXT: v_readlane_b32 s31, v40, 1
@@ -7964,7 +7964,10 @@ define void @stack_8xv5i32() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-NEXT: v_writelane_b32 v40, s4, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 7
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: v_mov_b32_e32 v0, 8
@@ -7980,10 +7983,8 @@ define void @stack_8xv5i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX9-NEXT: v_mov_b32_e32 v0, 14
-; GFX9-NEXT: v_writelane_b32 v40, s4, 2
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
@@ -8019,7 +8020,6 @@ define void @stack_8xv5i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v28, 5
; GFX9-NEXT: v_mov_b32_e32 v29, 5
; GFX9-NEXT: v_mov_b32_e32 v30, 6
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -8041,15 +8041,16 @@ define void @stack_8xv5i32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, 8
; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_mov_b32 v3, 10
; GFX11-NEXT: v_dual_mov_b32 v8, 15 :: v_dual_mov_b32 v5, 12
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_dual_mov_b32 v4, 11 :: v_dual_mov_b32 v7, 14
; GFX11-NEXT: v_mov_b32_e32 v6, 13
; GFX11-NEXT: s_add_i32 s0, s32, 32
; GFX11-NEXT: s_add_i32 s1, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
@@ -8073,9 +8074,8 @@ define void @stack_8xv5i32() #0 {
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32 at rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -8095,7 +8095,10 @@ define void @stack_8xv5i32() #0 {
; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1
; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HSA-NEXT: s_mov_b64 exec, s[8:9]
+; HSA-NEXT: v_writelane_b32 v40, s4, 2
+; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: s_addk_i32 s32, 0x400
+; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: v_mov_b32_e32 v0, 7
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32
; HSA-NEXT: v_mov_b32_e32 v0, 8
@@ -8111,10 +8114,8 @@ define void @stack_8xv5i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v0, 13
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; HSA-NEXT: v_mov_b32_e32 v0, 14
-; HSA-NEXT: v_writelane_b32 v40, s4, 2
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28
; HSA-NEXT: v_mov_b32_e32 v0, 15
-; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
@@ -8150,7 +8151,6 @@ define void @stack_8xv5i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v28, 5
; HSA-NEXT: v_mov_b32_e32 v29, 5
; HSA-NEXT: v_mov_b32_e32 v30, 6
-; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT: v_readlane_b32 s30, v40, 0
; HSA-NEXT: v_readlane_b32 s31, v40, 1
@@ -8184,7 +8184,10 @@ define void @stack_8xv5f32() #0 {
; VI-NEXT: s_or_saveexec_b64 s[8:9], -1
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[8:9]
+; VI-NEXT: v_writelane_b32 v40, s4, 2
+; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: s_addk_i32 s32, 0x400
+; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: v_mov_b32_e32 v0, 0x40e00000
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32
; VI-NEXT: v_mov_b32_e32 v0, 0x41000000
@@ -8200,10 +8203,8 @@ define void @stack_8xv5f32() #0 {
; VI-NEXT: v_mov_b32_e32 v0, 0x41500000
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; VI-NEXT: v_mov_b32_e32 v0, 0x41600000
-; VI-NEXT: v_writelane_b32 v40, s4, 2
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28
; VI-NEXT: v_mov_b32_e32 v0, 0x41700000
-; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
@@ -8239,7 +8240,6 @@ define void @stack_8xv5f32() #0 {
; VI-NEXT: v_mov_b32_e32 v28, 0x40a00000
; VI-NEXT: v_mov_b32_e32 v29, 0x40a00000
; VI-NEXT: v_mov_b32_e32 v30, 0x40c00000
-; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: v_readlane_b32 s30, v40, 0
; VI-NEXT: v_readlane_b32 s31, v40, 1
@@ -8260,7 +8260,10 @@ define void @stack_8xv5f32() #0 {
; CI-NEXT: s_or_saveexec_b64 s[8:9], -1
; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CI-NEXT: s_mov_b64 exec, s[8:9]
+; CI-NEXT: v_writelane_b32 v40, s4, 2
+; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: s_addk_i32 s32, 0x400
+; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: v_mov_b32_e32 v0, 0x40e00000
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32
; CI-NEXT: v_mov_b32_e32 v0, 0x41000000
@@ -8276,10 +8279,8 @@ define void @stack_8xv5f32() #0 {
; CI-NEXT: v_mov_b32_e32 v0, 0x41500000
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; CI-NEXT: v_mov_b32_e32 v0, 0x41600000
-; CI-NEXT: v_writelane_b32 v40, s4, 2
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28
; CI-NEXT: v_mov_b32_e32 v0, 0x41700000
-; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
@@ -8315,7 +8316,6 @@ define void @stack_8xv5f32() #0 {
; CI-NEXT: v_mov_b32_e32 v28, 0x40a00000
; CI-NEXT: v_mov_b32_e32 v29, 0x40a00000
; CI-NEXT: v_mov_b32_e32 v30, 0x40c00000
-; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: v_readlane_b32 s30, v40, 0
; CI-NEXT: v_readlane_b32 s31, v40, 1
@@ -8336,7 +8336,10 @@ define void @stack_8xv5f32() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-NEXT: v_writelane_b32 v40, s4, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0x40e00000
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000
@@ -8352,10 +8355,8 @@ define void @stack_8xv5f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000
-; GFX9-NEXT: v_writelane_b32 v40, s4, 2
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
@@ -8391,7 +8392,6 @@ define void @stack_8xv5f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v28, 0x40a00000
; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000
; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -8413,19 +8413,20 @@ define void @stack_8xv5f32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40e00000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x41000000
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41100000
; GFX11-NEXT: v_mov_b32_e32 v3, 0x41200000
; GFX11-NEXT: v_mov_b32_e32 v8, 0x41700000
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_mov_b32_e32 v4, 0x41300000
; GFX11-NEXT: v_mov_b32_e32 v5, 0x41400000
; GFX11-NEXT: v_dual_mov_b32 v6, 0x41500000 :: v_dual_mov_b32 v9, 1.0
; GFX11-NEXT: v_mov_b32_e32 v7, 0x41600000
; GFX11-NEXT: s_add_i32 s0, s32, 32
; GFX11-NEXT: s_add_i32 s1, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b32 off, v8, s0
; GFX11-NEXT: scratch_store_b128 off, v[4:7], s1
@@ -8448,9 +8449,8 @@ define void @stack_8xv5f32() #0 {
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -8470,7 +8470,10 @@ define void @stack_8xv5f32() #0 {
; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1
; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HSA-NEXT: s_mov_b64 exec, s[8:9]
+; HSA-NEXT: v_writelane_b32 v40, s4, 2
+; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: s_addk_i32 s32, 0x400
+; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: v_mov_b32_e32 v0, 0x40e00000
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32
; HSA-NEXT: v_mov_b32_e32 v0, 0x41000000
@@ -8486,10 +8489,8 @@ define void @stack_8xv5f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v0, 0x41500000
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; HSA-NEXT: v_mov_b32_e32 v0, 0x41600000
-; HSA-NEXT: v_writelane_b32 v40, s4, 2
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28
; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000
-; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
@@ -8525,7 +8526,6 @@ define void @stack_8xv5f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v28, 0x40a00000
; HSA-NEXT: v_mov_b32_e32 v29, 0x40a00000
; HSA-NEXT: v_mov_b32_e32 v30, 0x40c00000
-; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT: v_readlane_b32 s30, v40, 0
; HSA-NEXT: v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index d443ad62ac7ef..6a6b7e84ed1ab 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -40,14 +40,14 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT: v_writelane_b32 v40, s34, 0
; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 0
; MUBUF-NEXT: v_writelane_b32 v40, s35, 1
; MUBUF-NEXT: v_writelane_b32 v40, s30, 2
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 3
; MUBUF-NEXT: s_getpc_b64 s[34:35]
; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 3
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
@@ -74,14 +74,14 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0
; FLATSCR-NEXT: v_writelane_b32 v40, s35, 1
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 2
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3
; FLATSCR-NEXT: s_getpc_b64 s[34:35]
; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
@@ -114,14 +114,14 @@ define void @test_func_call_external_void_funcx2() #0 {
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT: v_writelane_b32 v40, s34, 0
; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 0
; MUBUF-NEXT: v_writelane_b32 v40, s35, 1
; MUBUF-NEXT: v_writelane_b32 v40, s30, 2
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 3
; MUBUF-NEXT: s_getpc_b64 s[34:35]
; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 3
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
; MUBUF-NEXT: v_readlane_b32 s30, v40, 2
@@ -146,14 +146,14 @@ define void @test_func_call_external_void_funcx2() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0
; FLATSCR-NEXT: v_writelane_b32 v40, s35, 1
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 2
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3
; FLATSCR-NEXT: s_getpc_b64 s[34:35]
; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 2
@@ -454,10 +454,10 @@ define void @callee_saved_sgpr_func() #2 {
; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_writelane_b32 v40, s34, 0
; MUBUF-NEXT: v_writelane_b32 v40, s30, 1
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 2
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 2
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; def s40
; MUBUF-NEXT: ;;#ASMEND
@@ -490,10 +490,10 @@ define void @callee_saved_sgpr_func() #2 {
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 2
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 2
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; def s40
; FLATSCR-NEXT: ;;#ASMEND
@@ -555,13 +555,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v41, s4, 3
; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: v_writelane_b32 v41, s34, 0
; MUBUF-NEXT: v_writelane_b32 v41, s30, 1
+; MUBUF-NEXT: v_writelane_b32 v41, s31, 2
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; MUBUF-NEXT: v_writelane_b32 v41, s31, 2
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; def s40
; MUBUF-NEXT: ;;#ASMEND
@@ -599,13 +599,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v41, s0, 3
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: v_writelane_b32 v41, s34, 0
; FLATSCR-NEXT: v_writelane_b32 v41, s30, 1
+; FLATSCR-NEXT: v_writelane_b32 v41, s31, 2
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
-; FLATSCR-NEXT: v_writelane_b32 v41, s31, 2
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; def s40
; FLATSCR-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/call-skip.ll b/llvm/test/CodeGen/AMDGPU/call-skip.ll
index e8bf70da933c0..a257464984453 100644
--- a/llvm/test/CodeGen/AMDGPU/call-skip.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-skip.ll
@@ -22,9 +22,9 @@ define void @if_call(i32 %flag) #0 {
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: v_writelane_b32 v1, s30, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v1, s31, 1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_and_saveexec_b64 s[16:17], vcc
; GCN-NEXT: s_cbranch_execz .LBB1_2
; GCN-NEXT: ; %bb.1: ; %call
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 962df52e984b2..96e98b18c1a90 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -122,13 +122,13 @@ define void @callee_with_stack_and_call() #0 {
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[18:19]
; MUBUF-NEXT: v_writelane_b32 v40, s16, 2
-; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: s_getpc_b64 s[16:17]
; MUBUF-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -152,13 +152,13 @@ define void @callee_with_stack_and_call() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
-; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -194,12 +194,12 @@ define void @callee_no_stack_with_call() #0 {
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[18:19]
; MUBUF-NEXT: v_writelane_b32 v40, s16, 2
-; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_getpc_b64 s[16:17]
; MUBUF-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17]
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
@@ -221,12 +221,12 @@ define void @callee_no_stack_with_call() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
-; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
@@ -470,7 +470,7 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
; Has no spilled CSR VGPRs used for SGPR spilling, so no need to
; enable all lanes and restore.
-define void @spill_only_csr_sgpr() {
+define void @spill_only_csr_sgpr() #0 {
; GCN-LABEL: spill_only_csr_sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -489,15 +489,15 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: s_mov_b32 s4, s33
; MUBUF-NEXT: s_mov_b32 s33, s32
-; MUBUF-NEXT: v_mov_b32_e32 v0, 0
+; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber v41
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: s_mov_b32 s33, s4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -508,15 +508,15 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s0, s33
; FLATSCR-NEXT: s_mov_b32 s33, s32
-; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber v41
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
-; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: s_mov_b32 s33, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -537,6 +537,8 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: v_writelane_b32 v1, s48, 0
; MUBUF-NEXT: v_writelane_b32 v1, s49, 1
; MUBUF-NEXT: v_writelane_b32 v1, s50, 2
@@ -566,19 +568,17 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
; MUBUF-NEXT: v_writelane_b32 v1, s98, 26
; MUBUF-NEXT: v_writelane_b32 v1, s99, 27
; MUBUF-NEXT: v_writelane_b32 v1, s100, 28
-; MUBUF-NEXT: v_mov_b32_e32 v0, 0
-; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: v_writelane_b32 v1, s101, 29
+; MUBUF-NEXT: v_writelane_b32 v1, s102, 30
+; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber v41
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: v_writelane_b32 v1, s102, 30
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_readlane_b32 s102, v1, 30
; MUBUF-NEXT: v_readlane_b32 s101, v1, 29
; MUBUF-NEXT: v_readlane_b32 s100, v1, 28
@@ -626,6 +626,8 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: v_writelane_b32 v1, s48, 0
; FLATSCR-NEXT: v_writelane_b32 v1, s49, 1
; FLATSCR-NEXT: v_writelane_b32 v1, s50, 2
@@ -655,19 +657,17 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
; FLATSCR-NEXT: v_writelane_b32 v1, s98, 26
; FLATSCR-NEXT: v_writelane_b32 v1, s99, 27
; FLATSCR-NEXT: v_writelane_b32 v1, s100, 28
-; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: v_writelane_b32 v1, s101, 29
+; FLATSCR-NEXT: v_writelane_b32 v1, s102, 30
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber v41
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: v_writelane_b32 v1, s102, 30
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
-; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_readlane_b32 s102, v1, 30
; FLATSCR-NEXT: v_readlane_b32 s101, v1, 29
; FLATSCR-NEXT: v_readlane_b32 s100, v1, 28
@@ -731,6 +731,8 @@ define void @no_new_vgpr_for_fp_csr() #1 {
; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: v_writelane_b32 v1, s39, 0
; MUBUF-NEXT: v_writelane_b32 v1, s48, 1
; MUBUF-NEXT: v_writelane_b32 v1, s49, 2
@@ -761,19 +763,17 @@ define void @no_new_vgpr_for_fp_csr() #1 {
; MUBUF-NEXT: v_writelane_b32 v1, s98, 27
; MUBUF-NEXT: v_writelane_b32 v1, s99, 28
; MUBUF-NEXT: v_writelane_b32 v1, s100, 29
-; MUBUF-NEXT: v_mov_b32_e32 v0, 0
-; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: v_writelane_b32 v1, s101, 30
+; MUBUF-NEXT: v_writelane_b32 v1, s102, 31
+; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber v41
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: v_writelane_b32 v1, s102, 31
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_readlane_b32 s102, v1, 31
; MUBUF-NEXT: v_readlane_b32 s101, v1, 30
; MUBUF-NEXT: v_readlane_b32 s100, v1, 29
@@ -822,6 +822,8 @@ define void @no_new_vgpr_for_fp_csr() #1 {
; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: v_writelane_b32 v1, s39, 0
; FLATSCR-NEXT: v_writelane_b32 v1, s48, 1
; FLATSCR-NEXT: v_writelane_b32 v1, s49, 2
@@ -852,19 +854,17 @@ define void @no_new_vgpr_for_fp_csr() #1 {
; FLATSCR-NEXT: v_writelane_b32 v1, s98, 27
; FLATSCR-NEXT: v_writelane_b32 v1, s99, 28
; FLATSCR-NEXT: v_writelane_b32 v1, s100, 29
-; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: v_writelane_b32 v1, s101, 30
+; FLATSCR-NEXT: v_writelane_b32 v1, s102, 31
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber v41
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: v_writelane_b32 v1, s102, 31
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
-; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_readlane_b32 s102, v1, 31
; FLATSCR-NEXT: v_readlane_b32 s101, v1, 30
; FLATSCR-NEXT: v_readlane_b32 s100, v1, 29
@@ -970,13 +970,13 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
; MUBUF-NEXT: v_writelane_b32 v1, s30, 0
-; MUBUF-NEXT: v_mov_b32_e32 v0, 0
+; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: v_writelane_b32 v1, s31, 1
+; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: v_readlane_b32 s30, v1, 0
; MUBUF-NEXT: v_readlane_b32 s31, v1, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
@@ -996,13 +996,13 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT: v_writelane_b32 v1, s30, 0
-; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0
; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
@@ -1036,13 +1036,13 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT: v_mov_b32_e32 v0, 0
+; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber nonpreserved initial VGPRs
@@ -1065,13 +1065,13 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber nonpreserved initial VGPRs
@@ -1116,15 +1116,15 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s6 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1000
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber nonpreserved SGPRs
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber nonpreserved VGPRs
@@ -1148,11 +1148,11 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004
; FLATSCR-NEXT: scratch_store_dword off, v40, s2 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT: s_addk_i32 s32, 0x100c
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: s_addk_i32 s32, 0x100c
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_add_i32 s0, s33, 0x1000
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
@@ -1210,13 +1210,13 @@ define void @ipra_call_with_stack() #0 {
; MUBUF-NEXT: s_xor_saveexec_b64 s[16:17], -1
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[16:17]
-; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_writelane_b32 v1, s30, 0
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v1, s31, 1
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: s_getpc_b64 s[16:17]
; MUBUF-NEXT: s_add_u32 s16, s16, local_empty_func@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s17, s17, local_empty_func@rel32@hi+12
-; MUBUF-NEXT: v_writelane_b32 v1, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -1238,13 +1238,13 @@ define void @ipra_call_with_stack() #0 {
; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_writelane_b32 v1, s30, 0
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, local_empty_func@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, local_empty_func@rel32@hi+12
-; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -1369,6 +1369,7 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v39, s4, 32
+; MUBUF-NEXT: s_addk_i32 s32, 0x200
; MUBUF-NEXT: v_writelane_b32 v39, s39, 0
; MUBUF-NEXT: v_writelane_b32 v39, s48, 1
; MUBUF-NEXT: v_writelane_b32 v39, s49, 2
@@ -1400,7 +1401,6 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
; MUBUF-NEXT: v_writelane_b32 v39, s99, 28
; MUBUF-NEXT: v_writelane_b32 v39, s100, 29
; MUBUF-NEXT: v_writelane_b32 v39, s101, 30
-; MUBUF-NEXT: s_addk_i32 s32, 0x200
; MUBUF-NEXT: v_writelane_b32 v39, s102, 31
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs
@@ -1457,6 +1457,7 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_store_dword off, v39, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_add_i32 s32, s32, 8
; FLATSCR-NEXT: v_writelane_b32 v39, s39, 0
; FLATSCR-NEXT: v_writelane_b32 v39, s48, 1
; FLATSCR-NEXT: v_writelane_b32 v39, s49, 2
@@ -1488,7 +1489,6 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
; FLATSCR-NEXT: v_writelane_b32 v39, s99, 28
; FLATSCR-NEXT: v_writelane_b32 v39, s100, 29
; FLATSCR-NEXT: v_writelane_b32 v39, s101, 30
-; FLATSCR-NEXT: s_add_i32 s32, s32, 8
; FLATSCR-NEXT: v_writelane_b32 v39, s102, 31
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs
@@ -1569,6 +1569,7 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v40, s4, 32
+; MUBUF-NEXT: s_addk_i32 s32, 0x200
; MUBUF-NEXT: v_writelane_b32 v40, s39, 0
; MUBUF-NEXT: v_writelane_b32 v40, s48, 1
; MUBUF-NEXT: v_writelane_b32 v40, s49, 2
@@ -1600,7 +1601,6 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
; MUBUF-NEXT: v_writelane_b32 v40, s99, 28
; MUBUF-NEXT: v_writelane_b32 v40, s100, 29
; MUBUF-NEXT: v_writelane_b32 v40, s101, 30
-; MUBUF-NEXT: s_addk_i32 s32, 0x200
; MUBUF-NEXT: v_writelane_b32 v40, s102, 31
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs
@@ -1657,6 +1657,7 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_add_i32 s32, s32, 8
; FLATSCR-NEXT: v_writelane_b32 v40, s39, 0
; FLATSCR-NEXT: v_writelane_b32 v40, s48, 1
; FLATSCR-NEXT: v_writelane_b32 v40, s49, 2
@@ -1688,7 +1689,6 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
; FLATSCR-NEXT: v_writelane_b32 v40, s99, 28
; FLATSCR-NEXT: v_writelane_b32 v40, s100, 29
; FLATSCR-NEXT: v_writelane_b32 v40, s101, 30
-; FLATSCR-NEXT: s_add_i32 s32, s32, 8
; FLATSCR-NEXT: v_writelane_b32 v40, s102, 31
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs
@@ -1768,6 +1768,7 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5)
; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v39, s4, 32
+; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300
; MUBUF-NEXT: v_writelane_b32 v39, s39, 0
; MUBUF-NEXT: v_writelane_b32 v39, s48, 1
; MUBUF-NEXT: v_writelane_b32 v39, s49, 2
@@ -1799,10 +1800,9 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5)
; MUBUF-NEXT: v_writelane_b32 v39, s99, 28
; MUBUF-NEXT: v_writelane_b32 v39, s100, 29
; MUBUF-NEXT: v_writelane_b32 v39, s101, 30
+; MUBUF-NEXT: v_writelane_b32 v39, s102, 31
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1000
-; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300
-; MUBUF-NEXT: v_writelane_b32 v39, s102, 31
; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
@@ -1862,6 +1862,7 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5)
; FLATSCR-NEXT: s_add_i32 s1, s33, 0x1004
; FLATSCR-NEXT: scratch_store_dword off, v39, s1 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_addk_i32 s32, 0x100c
; FLATSCR-NEXT: v_writelane_b32 v39, s39, 0
; FLATSCR-NEXT: v_writelane_b32 v39, s48, 1
; FLATSCR-NEXT: v_writelane_b32 v39, s49, 2
@@ -1891,12 +1892,11 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5)
; FLATSCR-NEXT: v_writelane_b32 v39, s97, 26
; FLATSCR-NEXT: v_writelane_b32 v39, s98, 27
; FLATSCR-NEXT: v_writelane_b32 v39, s99, 28
-; FLATSCR-NEXT: s_addk_i32 s32, 0x100c
; FLATSCR-NEXT: v_writelane_b32 v39, s100, 29
; FLATSCR-NEXT: v_writelane_b32 v39, s101, 30
+; FLATSCR-NEXT: v_writelane_b32 v39, s102, 31
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_add_i32 s1, s33, 0x1000
-; FLATSCR-NEXT: v_writelane_b32 v39, s102, 31
; FLATSCR-NEXT: scratch_store_dword off, v0, s1
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
@@ -1977,15 +1977,15 @@ define void @dont_save_fp_bp_for_noreturn_funcs() #4 {
; MUBUF-NEXT: s_or_saveexec_b64 s[16:17], -1
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[16:17]
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: s_mov_b32 s34, s32
; MUBUF-NEXT: s_addk_i32 s32, 0x4000
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_getpc_b64 s[16:17]
; MUBUF-NEXT: s_add_u32 s16, s16, dont_save_fp_bp_for_noreturn_funcs@gotpcrel32@lo+4
; MUBUF-NEXT: s_addc_u32 s17, s17, dont_save_fp_bp_for_noreturn_funcs@gotpcrel32@hi+12
; MUBUF-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -1998,15 +1998,15 @@ define void @dont_save_fp_bp_for_noreturn_funcs() #4 {
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: s_mov_b32 s34, s32
; FLATSCR-NEXT: s_addk_i32 s32, 0x100
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, dont_save_fp_bp_for_noreturn_funcs@gotpcrel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, dont_save_fp_bp_for_noreturn_funcs@gotpcrel32@hi+12
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
index bb5963244da3c..a7009c4d20e33 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -420,14 +420,14 @@ define void @func_indirect_use_workitem_id_x() #1 {
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_x@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_x@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -453,14 +453,14 @@ define void @func_indirect_use_workitem_id_y() #1 {
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_y@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_y@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -486,14 +486,14 @@ define void @func_indirect_use_workitem_id_z() #1 {
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_z@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_z@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -939,8 +939,10 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: v_writelane_b32 v40, s4, 2
+; GFX7-NEXT: v_writelane_b32 v40, s30, 0
+; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: v_writelane_b32 v40, s31, 1
; GFX7-NEXT: s_getpc_b64 s[4:5]
; GFX7-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12
@@ -948,7 +950,6 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GFX7-NEXT: flat_store_dword v[0:1], v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, 0x140
-; GFX7-NEXT: v_writelane_b32 v40, s30, 0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX7-NEXT: v_mov_b32_e32 v0, 10
; GFX7-NEXT: v_mov_b32_e32 v1, 20
@@ -981,7 +982,6 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GFX7-NEXT: v_mov_b32_e32 v28, 0x122
; GFX7-NEXT: v_mov_b32_e32 v29, 0x12c
; GFX7-NEXT: v_mov_b32_e32 v30, 0x136
-; GFX7-NEXT: v_writelane_b32 v40, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
@@ -1003,8 +1003,10 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GFX90A-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
-; GFX90A-NEXT: s_addk_i32 s32, 0x400
; GFX90A-NEXT: v_writelane_b32 v40, s4, 2
+; GFX90A-NEXT: v_writelane_b32 v40, s30, 0
+; GFX90A-NEXT: s_addk_i32 s32, 0x400
+; GFX90A-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-NEXT: s_getpc_b64 s[4:5]
; GFX90A-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4
; GFX90A-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12
@@ -1012,7 +1014,6 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x140
-; GFX90A-NEXT: v_writelane_b32 v40, s30, 0
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX90A-NEXT: v_mov_b32_e32 v0, 10
; GFX90A-NEXT: v_mov_b32_e32 v1, 20
@@ -1045,7 +1046,6 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GFX90A-NEXT: v_mov_b32_e32 v28, 0x122
; GFX90A-NEXT: v_mov_b32_e32 v29, 0x12c
; GFX90A-NEXT: v_mov_b32_e32 v30, 0x136
-; GFX90A-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
@@ -1081,15 +1081,15 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -1396,19 +1396,20 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: v_mov_b32_e32 v0, 0x3e7
+; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3e7
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0x140
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33
-; GCN-NEXT: v_writelane_b32 v40, s4, 2
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x_byval@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x_byval@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_mov_b32_e32 v1, 20
; GCN-NEXT: v_mov_b32_e32 v2, 30
; GCN-NEXT: v_mov_b32_e32 v3, 40
@@ -1439,7 +1440,6 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; GCN-NEXT: v_mov_b32_e32 v28, 0x122
; GCN-NEXT: v_mov_b32_e32 v29, 0x12c
; GCN-NEXT: v_mov_b32_e32 v30, 0x136
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mov_b32_e32 v0, 10
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index f20be656f3af0..718140f82887e 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -265,14 +265,14 @@ define void @func_indirect_use_workitem_id_x() #1 {
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_x@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_x@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -298,14 +298,14 @@ define void @func_indirect_use_workitem_id_y() #1 {
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_y@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_y@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -331,14 +331,14 @@ define void @func_indirect_use_workitem_id_z() #1 {
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_z@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_z@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -651,8 +651,10 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12
@@ -660,7 +662,6 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GCN-NEXT: flat_store_dword v[0:1], v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0x140
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GCN-NEXT: v_mov_b32_e32 v0, 10
; GCN-NEXT: v_mov_b32_e32 v1, 20
@@ -693,7 +694,6 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GCN-NEXT: v_mov_b32_e32 v28, 0x122
; GCN-NEXT: v_mov_b32_e32 v29, 0x12c
; GCN-NEXT: v_mov_b32_e32 v30, 0x136
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -729,15 +729,15 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -970,19 +970,20 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: v_mov_b32_e32 v0, 0x3e7
+; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3e7
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0x140
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33
-; GCN-NEXT: v_writelane_b32 v40, s4, 2
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x_byval@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x_byval@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_mov_b32_e32 v1, 20
; GCN-NEXT: v_mov_b32_e32 v2, 30
; GCN-NEXT: v_mov_b32_e32 v3, 40
@@ -1013,7 +1014,6 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; GCN-NEXT: v_mov_b32_e32 v28, 0x122
; GCN-NEXT: v_mov_b32_e32 v29, 0x12c
; GCN-NEXT: v_mov_b32_e32 v30, 0x136
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mov_b32_e32 v0, 10
@@ -1461,13 +1461,13 @@ define void @func_call_no_workitem_id_hints() #2 {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, extern_hint@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, extern_hint@rel32@hi+12
; GCN-NEXT: v_mov_b32_e32 v0, 9
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/cc-entry.ll b/llvm/test/CodeGen/AMDGPU/cc-entry.ll
index c63512f630aaf..dca70342077c3 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-entry.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-entry.ll
@@ -37,8 +37,10 @@ define void @caller() #0 {
; CHECK-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-NEXT: s_mov_b32 exec_lo, s1
-; CHECK-NEXT: s_add_co_i32 s32, s32, 16
; CHECK-NEXT: v_writelane_b32 v40, s0, 2
+; CHECK-NEXT: s_add_co_i32 s32, s32, 16
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_getpc_b64 s[0:1]
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-NEXT: s_sext_i32_i16 s1, s1
@@ -46,11 +48,8 @@ define void @caller() #0 {
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-NEXT: s_add_co_ci_u32 s1, s1, entry_fn@gotpcrel32@hi+24
; CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; CHECK-NEXT: v_writelane_b32 v40, s30, 0
-; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_wait_kmcnt 0x0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
diff --git a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
index 2c463c5bfebac..d8bf3f3f07f62 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
@@ -112,12 +112,13 @@ define i32 @caller_passes_42() #0 {
; SDAG-NEXT: s_xor_saveexec_b64 s[16:17], -1
; SDAG-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
; SDAG-NEXT: s_mov_b64 exec, s[16:17]
+; SDAG-NEXT: v_writelane_b32 v18, s30, 0
; SDAG-NEXT: s_addk_i32 s32, 0x400
+; SDAG-NEXT: v_writelane_b32 v18, s31, 1
; SDAG-NEXT: s_getpc_b64 s[16:17]
; SDAG-NEXT: s_add_u32 s16, s16, callee_returns_arg0@gotpcrel32@lo+4
; SDAG-NEXT: s_addc_u32 s17, s17, callee_returns_arg0@gotpcrel32@hi+12
; SDAG-NEXT: s_load_dwordx2 s[40:41], s[16:17], 0x0
-; SDAG-NEXT: v_writelane_b32 v18, s30, 0
; SDAG-NEXT: s_mov_b32 s16, 42
; SDAG-NEXT: s_mov_b32 s17, 1
; SDAG-NEXT: s_mov_b32 s18, 2
@@ -150,7 +151,6 @@ define i32 @caller_passes_42() #0 {
; SDAG-NEXT: v_mov_b32_e32 v15, 29
; SDAG-NEXT: v_mov_b32_e32 v16, 30
; SDAG-NEXT: v_mov_b32_e32 v17, 31
-; SDAG-NEXT: v_writelane_b32 v18, s31, 1
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: s_swappc_b64 s[30:31], s[40:41]
; SDAG-NEXT: v_readlane_b32 s30, v18, 0
@@ -171,12 +171,13 @@ define i32 @caller_passes_42() #0 {
; GISEL-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GISEL-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[16:17]
+; GISEL-NEXT: v_writelane_b32 v18, s30, 0
; GISEL-NEXT: s_addk_i32 s32, 0x400
+; GISEL-NEXT: v_writelane_b32 v18, s31, 1
; GISEL-NEXT: s_getpc_b64 s[16:17]
; GISEL-NEXT: s_add_u32 s16, s16, callee_returns_arg0@gotpcrel32@lo+4
; GISEL-NEXT: s_addc_u32 s17, s17, callee_returns_arg0@gotpcrel32@hi+12
; GISEL-NEXT: s_load_dwordx2 s[40:41], s[16:17], 0x0
-; GISEL-NEXT: v_writelane_b32 v18, s30, 0
; GISEL-NEXT: s_mov_b32 s16, 42
; GISEL-NEXT: s_mov_b32 s17, 1
; GISEL-NEXT: s_mov_b32 s18, 2
@@ -209,7 +210,6 @@ define i32 @caller_passes_42() #0 {
; GISEL-NEXT: v_mov_b32_e32 v15, 29
; GISEL-NEXT: v_mov_b32_e32 v16, 30
; GISEL-NEXT: v_mov_b32_e32 v17, 31
-; GISEL-NEXT: v_writelane_b32 v18, s31, 1
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_swappc_b64 s[30:31], s[40:41]
; GISEL-NEXT: v_readlane_b32 s30, v18, 0
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index b7f3578d06efc..ce930036c8d59 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -378,6 +378,10 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v43, s16, 14
+; GFX9-NEXT: s_addk_i32 s32, 0x800
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v43, s34, 0
; GFX9-NEXT: v_writelane_b32 v43, s35, 1
; GFX9-NEXT: v_writelane_b32 v43, s36, 2
@@ -389,18 +393,14 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: v_writelane_b32 v43, s50, 8
; GFX9-NEXT: v_writelane_b32 v43, s51, 9
; GFX9-NEXT: v_writelane_b32 v43, s52, 10
-; GFX9-NEXT: s_addk_i32 s32, 0x800
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v43, s53, 11
-; GFX9-NEXT: v_mov_b32_e32 v42, v1
; GFX9-NEXT: v_writelane_b32 v43, s30, 12
+; GFX9-NEXT: v_writelane_b32 v43, s31, 13
+; GFX9-NEXT: v_mov_b32_e32 v42, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, _Z4log2d@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v43, s31, 13
; GFX9-NEXT: v_mov_b32_e32 v40, v31
; GFX9-NEXT: v_mov_b32_e32 v41, v2
; GFX9-NEXT: s_mov_b32 s50, s15
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 5ca7a309cadad..1cf69b2530c35 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -33,12 +33,12 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s31, v40, 1
@@ -69,12 +69,12 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s31, v40, 1
@@ -105,12 +105,12 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s31, v40, 1
@@ -141,12 +141,12 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
index 972f897013419..6426eb1ed4aa5 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
@@ -212,118 +212,231 @@ define void @callee_need_to_spill_fp_to_memory() #1 {
; GFX900-NEXT: .cfi_register 65, 72
; GFX900-NEXT: s_mov_b32 s33, s32
; GFX900-NEXT: .cfi_def_cfa_register 65
+; GFX900-NEXT: s_addk_i32 s32, 0x7100
; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 28416
; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 28160
; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2602, 32, 17, 64, 27904
; GFX900-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2603, 32, 17, 64, 27648
; GFX900-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2604, 32, 17, 64, 27392
; GFX900-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2605, 32, 17, 64, 27136
; GFX900-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2606, 32, 17, 64, 26880
; GFX900-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2607, 32, 17, 64, 26624
; GFX900-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2616, 32, 17, 64, 26368
; GFX900-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2617, 32, 17, 64, 26112
; GFX900-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2618, 32, 17, 64, 25856
; GFX900-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2619, 32, 17, 64, 25600
; GFX900-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2620, 32, 17, 64, 25344
; GFX900-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2621, 32, 17, 64, 25088
; GFX900-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2622, 32, 17, 64, 24832
; GFX900-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2623, 32, 17, 64, 24576
; GFX900-NEXT: buffer_store_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2632, 32, 17, 64, 24320
; GFX900-NEXT: buffer_store_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2633, 32, 17, 64, 24064
; GFX900-NEXT: buffer_store_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2634, 32, 17, 64, 23808
; GFX900-NEXT: buffer_store_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2635, 32, 17, 64, 23552
; GFX900-NEXT: buffer_store_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2636, 32, 17, 64, 23296
; GFX900-NEXT: buffer_store_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2637, 32, 17, 64, 23040
; GFX900-NEXT: buffer_store_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2638, 32, 17, 64, 22784
; GFX900-NEXT: buffer_store_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2639, 32, 17, 64, 22528
; GFX900-NEXT: buffer_store_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2648, 32, 17, 64, 22272
; GFX900-NEXT: buffer_store_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2649, 32, 17, 64, 22016
; GFX900-NEXT: buffer_store_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2650, 32, 17, 64, 21760
; GFX900-NEXT: buffer_store_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2651, 32, 17, 64, 21504
; GFX900-NEXT: buffer_store_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2652, 32, 17, 64, 21248
; GFX900-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2653, 32, 17, 64, 20992
; GFX900-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2654, 32, 17, 64, 20736
; GFX900-NEXT: buffer_store_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2655, 32, 17, 64, 20480
; GFX900-NEXT: buffer_store_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2664, 32, 17, 64, 20224
; GFX900-NEXT: buffer_store_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2665, 32, 17, 64, 19968
; GFX900-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2666, 32, 17, 64, 19712
; GFX900-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2667, 32, 17, 64, 19456
; GFX900-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2668, 32, 17, 64, 19200
; GFX900-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2669, 32, 17, 64, 18944
; GFX900-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2670, 32, 17, 64, 18688
; GFX900-NEXT: buffer_store_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2671, 32, 17, 64, 18432
; GFX900-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2680, 32, 17, 64, 18176
; GFX900-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2681, 32, 17, 64, 17920
; GFX900-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2682, 32, 17, 64, 17664
; GFX900-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2683, 32, 17, 64, 17408
; GFX900-NEXT: buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2684, 32, 17, 64, 17152
; GFX900-NEXT: buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2685, 32, 17, 64, 16896
; GFX900-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2686, 32, 17, 64, 16640
; GFX900-NEXT: buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2687, 32, 17, 64, 16384
; GFX900-NEXT: buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2696, 32, 17, 64, 16128
; GFX900-NEXT: buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2697, 32, 17, 64, 15872
; GFX900-NEXT: buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2698, 32, 17, 64, 15616
; GFX900-NEXT: buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2699, 32, 17, 64, 15360
; GFX900-NEXT: buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2700, 32, 17, 64, 15104
; GFX900-NEXT: buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2701, 32, 17, 64, 14848
; GFX900-NEXT: buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2702, 32, 17, 64, 14592
; GFX900-NEXT: buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2703, 32, 17, 64, 14336
; GFX900-NEXT: buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2712, 32, 17, 64, 14080
; GFX900-NEXT: buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2713, 32, 17, 64, 13824
; GFX900-NEXT: buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2714, 32, 17, 64, 13568
; GFX900-NEXT: buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2715, 32, 17, 64, 13312
; GFX900-NEXT: buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2716, 32, 17, 64, 13056
; GFX900-NEXT: buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2717, 32, 17, 64, 12800
; GFX900-NEXT: buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2718, 32, 17, 64, 12544
; GFX900-NEXT: buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2719, 32, 17, 64, 12288
; GFX900-NEXT: buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2728, 32, 17, 64, 12032
; GFX900-NEXT: buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2729, 32, 17, 64, 11776
; GFX900-NEXT: buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2730, 32, 17, 64, 11520
; GFX900-NEXT: buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2731, 32, 17, 64, 11264
; GFX900-NEXT: buffer_store_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2732, 32, 17, 64, 11008
; GFX900-NEXT: buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2733, 32, 17, 64, 10752
; GFX900-NEXT: buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2734, 32, 17, 64, 10496
; GFX900-NEXT: buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2735, 32, 17, 64, 10240
; GFX900-NEXT: buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2744, 32, 17, 64, 9984
; GFX900-NEXT: buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2745, 32, 17, 64, 9728
; GFX900-NEXT: buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2746, 32, 17, 64, 9472
; GFX900-NEXT: buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2747, 32, 17, 64, 9216
; GFX900-NEXT: buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2748, 32, 17, 64, 8960
; GFX900-NEXT: buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2749, 32, 17, 64, 8704
; GFX900-NEXT: buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2750, 32, 17, 64, 8448
; GFX900-NEXT: buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2751, 32, 17, 64, 8192
; GFX900-NEXT: buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2760, 32, 17, 64, 7936
; GFX900-NEXT: buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2761, 32, 17, 64, 7680
; GFX900-NEXT: buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2762, 32, 17, 64, 7424
; GFX900-NEXT: buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2763, 32, 17, 64, 7168
; GFX900-NEXT: buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2764, 32, 17, 64, 6912
; GFX900-NEXT: buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2765, 32, 17, 64, 6656
; GFX900-NEXT: buffer_store_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2766, 32, 17, 64, 6400
; GFX900-NEXT: buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2767, 32, 17, 64, 6144
; GFX900-NEXT: buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2776, 32, 17, 64, 5888
; GFX900-NEXT: buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2777, 32, 17, 64, 5632
; GFX900-NEXT: buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2778, 32, 17, 64, 5376
; GFX900-NEXT: buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2779, 32, 17, 64, 5120
; GFX900-NEXT: buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2780, 32, 17, 64, 4864
; GFX900-NEXT: buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2781, 32, 17, 64, 4608
; GFX900-NEXT: buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2782, 32, 17, 64, 4352
; GFX900-NEXT: buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2783, 32, 17, 64, 4096
; GFX900-NEXT: buffer_store_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2792, 32, 17, 64, 3840
; GFX900-NEXT: buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2793, 32, 17, 64, 3584
; GFX900-NEXT: buffer_store_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2794, 32, 17, 64, 3328
; GFX900-NEXT: buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2795, 32, 17, 64, 3072
; GFX900-NEXT: buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2796, 32, 17, 64, 2816
; GFX900-NEXT: buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2797, 32, 17, 64, 2560
; GFX900-NEXT: buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2798, 32, 17, 64, 2304
; GFX900-NEXT: buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2799, 32, 17, 64, 2048
; GFX900-NEXT: buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2808, 32, 17, 64, 1792
; GFX900-NEXT: buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2809, 32, 17, 64, 1536
; GFX900-NEXT: buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2810, 32, 17, 64, 1280
; GFX900-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2811, 32, 17, 64, 1024
; GFX900-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2812, 32, 17, 64, 768
; GFX900-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2813, 32, 17, 64, 512
; GFX900-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2814, 32, 17, 64, 256
; GFX900-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2815, 32, 17, 64, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; clobber nonpreserved SGPRs
; GFX900-NEXT: ;;#ASMEND
@@ -442,7 +555,6 @@ define void @callee_need_to_spill_fp_to_memory() #1 {
; GFX900-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload
; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload
; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload
-; GFX900-NEXT: s_addk_i32 s32, 0x7100
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: .cfi_def_cfa_register 64
; GFX900-NEXT: s_mov_b32 s33, s40
@@ -630,118 +742,231 @@ define void @callee_need_to_spill_fp_to_memory() #1 {
; GFX90A-V2A-DIS-NEXT: .cfi_register 65, 72
; GFX90A-V2A-DIS-NEXT: s_mov_b32 s33, s32
; GFX90A-V2A-DIS-NEXT: .cfi_def_cfa_register 65
+; GFX90A-V2A-DIS-NEXT: s_addk_i32 s32, 0x7100
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 28416
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 28160
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2602, 32, 17, 64, 27904
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2603, 32, 17, 64, 27648
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2604, 32, 17, 64, 27392
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2605, 32, 17, 64, 27136
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2606, 32, 17, 64, 26880
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2607, 32, 17, 64, 26624
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2616, 32, 17, 64, 26368
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2617, 32, 17, 64, 26112
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2618, 32, 17, 64, 25856
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2619, 32, 17, 64, 25600
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2620, 32, 17, 64, 25344
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2621, 32, 17, 64, 25088
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2622, 32, 17, 64, 24832
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2623, 32, 17, 64, 24576
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2632, 32, 17, 64, 24320
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2633, 32, 17, 64, 24064
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2634, 32, 17, 64, 23808
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2635, 32, 17, 64, 23552
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2636, 32, 17, 64, 23296
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2637, 32, 17, 64, 23040
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2638, 32, 17, 64, 22784
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2639, 32, 17, 64, 22528
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2648, 32, 17, 64, 22272
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2649, 32, 17, 64, 22016
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2650, 32, 17, 64, 21760
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2651, 32, 17, 64, 21504
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2652, 32, 17, 64, 21248
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2653, 32, 17, 64, 20992
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2654, 32, 17, 64, 20736
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2655, 32, 17, 64, 20480
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2664, 32, 17, 64, 20224
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2665, 32, 17, 64, 19968
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2666, 32, 17, 64, 19712
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2667, 32, 17, 64, 19456
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2668, 32, 17, 64, 19200
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2669, 32, 17, 64, 18944
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2670, 32, 17, 64, 18688
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2671, 32, 17, 64, 18432
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2680, 32, 17, 64, 18176
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2681, 32, 17, 64, 17920
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2682, 32, 17, 64, 17664
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2683, 32, 17, 64, 17408
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2684, 32, 17, 64, 17152
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2685, 32, 17, 64, 16896
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2686, 32, 17, 64, 16640
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2687, 32, 17, 64, 16384
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2696, 32, 17, 64, 16128
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2697, 32, 17, 64, 15872
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2698, 32, 17, 64, 15616
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2699, 32, 17, 64, 15360
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2700, 32, 17, 64, 15104
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2701, 32, 17, 64, 14848
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2702, 32, 17, 64, 14592
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2703, 32, 17, 64, 14336
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2712, 32, 17, 64, 14080
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2713, 32, 17, 64, 13824
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2714, 32, 17, 64, 13568
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2715, 32, 17, 64, 13312
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2716, 32, 17, 64, 13056
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2717, 32, 17, 64, 12800
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2718, 32, 17, 64, 12544
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2719, 32, 17, 64, 12288
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2728, 32, 17, 64, 12032
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2729, 32, 17, 64, 11776
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2730, 32, 17, 64, 11520
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2731, 32, 17, 64, 11264
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2732, 32, 17, 64, 11008
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2733, 32, 17, 64, 10752
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2734, 32, 17, 64, 10496
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2735, 32, 17, 64, 10240
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2744, 32, 17, 64, 9984
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2745, 32, 17, 64, 9728
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2746, 32, 17, 64, 9472
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2747, 32, 17, 64, 9216
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2748, 32, 17, 64, 8960
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2749, 32, 17, 64, 8704
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2750, 32, 17, 64, 8448
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2751, 32, 17, 64, 8192
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2760, 32, 17, 64, 7936
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2761, 32, 17, 64, 7680
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2762, 32, 17, 64, 7424
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2763, 32, 17, 64, 7168
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2764, 32, 17, 64, 6912
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2765, 32, 17, 64, 6656
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2766, 32, 17, 64, 6400
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2767, 32, 17, 64, 6144
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2776, 32, 17, 64, 5888
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2777, 32, 17, 64, 5632
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2778, 32, 17, 64, 5376
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2779, 32, 17, 64, 5120
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2780, 32, 17, 64, 4864
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2781, 32, 17, 64, 4608
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2782, 32, 17, 64, 4352
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2783, 32, 17, 64, 4096
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2792, 32, 17, 64, 3840
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2793, 32, 17, 64, 3584
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2794, 32, 17, 64, 3328
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2795, 32, 17, 64, 3072
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2796, 32, 17, 64, 2816
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2797, 32, 17, 64, 2560
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2798, 32, 17, 64, 2304
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2799, 32, 17, 64, 2048
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2808, 32, 17, 64, 1792
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2809, 32, 17, 64, 1536
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2810, 32, 17, 64, 1280
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2811, 32, 17, 64, 1024
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2812, 32, 17, 64, 768
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2813, 32, 17, 64, 512
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2814, 32, 17, 64, 256
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2815, 32, 17, 64, 0
; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART
; GFX90A-V2A-DIS-NEXT: ; clobber nonpreserved SGPRs
; GFX90A-V2A-DIS-NEXT: ;;#ASMEND
@@ -860,7 +1085,6 @@ define void @callee_need_to_spill_fp_to_memory() #1 {
; GFX90A-V2A-DIS-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload
; GFX90A-V2A-DIS-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload
; GFX90A-V2A-DIS-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload
-; GFX90A-V2A-DIS-NEXT: s_addk_i32 s32, 0x7100
; GFX90A-V2A-DIS-NEXT: s_mov_b32 s32, s33
; GFX90A-V2A-DIS-NEXT: .cfi_def_cfa_register 64
; GFX90A-V2A-DIS-NEXT: s_mov_b32 s33, s40
@@ -1080,118 +1304,231 @@ define void @callee_need_to_spill_fp_to_memory() #1 {
; GFX90A-V2A-EN-NEXT: .cfi_register 65, 72
; GFX90A-V2A-EN-NEXT: s_mov_b32 s33, s32
; GFX90A-V2A-EN-NEXT: .cfi_def_cfa_register 65
+; GFX90A-V2A-EN-NEXT: s_addk_i32 s32, 0x5100
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2600, 3072, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2601, 3073, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2602, 3074, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2603, 3075, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2604, 3076, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2605, 3077, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2606, 3078, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2607, 3079, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2616, 3080, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2617, 3081, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2618, 3082, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2619, 3083, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2620, 3084, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2621, 3085, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2622, 3086, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2623, 3087, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a16, v72 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2632, 3088, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a17, v73 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2633, 3089, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a18, v74 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2634, 3090, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a19, v75 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2635, 3091, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a20, v76 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2636, 3092, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a21, v77 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2637, 3093, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a22, v78 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2638, 3094, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a23, v79 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2639, 3095, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a24, v88 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2648, 3096, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a25, v89 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2649, 3097, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a26, v90 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2650, 3098, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a27, v91 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2651, 3099, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a28, v92 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2652, 3100, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a29, v93 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2653, 3101, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a30, v94 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2654, 3102, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a31, v95 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2655, 3103, 32, 17, 64
; GFX90A-V2A-EN-NEXT: buffer_store_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2664, 32, 17, 64, 20224
; GFX90A-V2A-EN-NEXT: buffer_store_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2665, 32, 17, 64, 19968
; GFX90A-V2A-EN-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2666, 32, 17, 64, 19712
; GFX90A-V2A-EN-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2667, 32, 17, 64, 19456
; GFX90A-V2A-EN-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2668, 32, 17, 64, 19200
; GFX90A-V2A-EN-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2669, 32, 17, 64, 18944
; GFX90A-V2A-EN-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2670, 32, 17, 64, 18688
; GFX90A-V2A-EN-NEXT: buffer_store_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2671, 32, 17, 64, 18432
; GFX90A-V2A-EN-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2680, 32, 17, 64, 18176
; GFX90A-V2A-EN-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2681, 32, 17, 64, 17920
; GFX90A-V2A-EN-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2682, 32, 17, 64, 17664
; GFX90A-V2A-EN-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2683, 32, 17, 64, 17408
; GFX90A-V2A-EN-NEXT: buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2684, 32, 17, 64, 17152
; GFX90A-V2A-EN-NEXT: buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2685, 32, 17, 64, 16896
; GFX90A-V2A-EN-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2686, 32, 17, 64, 16640
; GFX90A-V2A-EN-NEXT: buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2687, 32, 17, 64, 16384
; GFX90A-V2A-EN-NEXT: buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2696, 32, 17, 64, 16128
; GFX90A-V2A-EN-NEXT: buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2697, 32, 17, 64, 15872
; GFX90A-V2A-EN-NEXT: buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2698, 32, 17, 64, 15616
; GFX90A-V2A-EN-NEXT: buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2699, 32, 17, 64, 15360
; GFX90A-V2A-EN-NEXT: buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2700, 32, 17, 64, 15104
; GFX90A-V2A-EN-NEXT: buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2701, 32, 17, 64, 14848
; GFX90A-V2A-EN-NEXT: buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2702, 32, 17, 64, 14592
; GFX90A-V2A-EN-NEXT: buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2703, 32, 17, 64, 14336
; GFX90A-V2A-EN-NEXT: buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2712, 32, 17, 64, 14080
; GFX90A-V2A-EN-NEXT: buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2713, 32, 17, 64, 13824
; GFX90A-V2A-EN-NEXT: buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2714, 32, 17, 64, 13568
; GFX90A-V2A-EN-NEXT: buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2715, 32, 17, 64, 13312
; GFX90A-V2A-EN-NEXT: buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2716, 32, 17, 64, 13056
; GFX90A-V2A-EN-NEXT: buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2717, 32, 17, 64, 12800
; GFX90A-V2A-EN-NEXT: buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2718, 32, 17, 64, 12544
; GFX90A-V2A-EN-NEXT: buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2719, 32, 17, 64, 12288
; GFX90A-V2A-EN-NEXT: buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2728, 32, 17, 64, 12032
; GFX90A-V2A-EN-NEXT: buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2729, 32, 17, 64, 11776
; GFX90A-V2A-EN-NEXT: buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2730, 32, 17, 64, 11520
; GFX90A-V2A-EN-NEXT: buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2731, 32, 17, 64, 11264
; GFX90A-V2A-EN-NEXT: buffer_store_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2732, 32, 17, 64, 11008
; GFX90A-V2A-EN-NEXT: buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2733, 32, 17, 64, 10752
; GFX90A-V2A-EN-NEXT: buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2734, 32, 17, 64, 10496
; GFX90A-V2A-EN-NEXT: buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2735, 32, 17, 64, 10240
; GFX90A-V2A-EN-NEXT: buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2744, 32, 17, 64, 9984
; GFX90A-V2A-EN-NEXT: buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2745, 32, 17, 64, 9728
; GFX90A-V2A-EN-NEXT: buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2746, 32, 17, 64, 9472
; GFX90A-V2A-EN-NEXT: buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2747, 32, 17, 64, 9216
; GFX90A-V2A-EN-NEXT: buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2748, 32, 17, 64, 8960
; GFX90A-V2A-EN-NEXT: buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2749, 32, 17, 64, 8704
; GFX90A-V2A-EN-NEXT: buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2750, 32, 17, 64, 8448
; GFX90A-V2A-EN-NEXT: buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2751, 32, 17, 64, 8192
; GFX90A-V2A-EN-NEXT: buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2760, 32, 17, 64, 7936
; GFX90A-V2A-EN-NEXT: buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2761, 32, 17, 64, 7680
; GFX90A-V2A-EN-NEXT: buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2762, 32, 17, 64, 7424
; GFX90A-V2A-EN-NEXT: buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2763, 32, 17, 64, 7168
; GFX90A-V2A-EN-NEXT: buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2764, 32, 17, 64, 6912
; GFX90A-V2A-EN-NEXT: buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2765, 32, 17, 64, 6656
; GFX90A-V2A-EN-NEXT: buffer_store_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2766, 32, 17, 64, 6400
; GFX90A-V2A-EN-NEXT: buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2767, 32, 17, 64, 6144
; GFX90A-V2A-EN-NEXT: buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2776, 32, 17, 64, 5888
; GFX90A-V2A-EN-NEXT: buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2777, 32, 17, 64, 5632
; GFX90A-V2A-EN-NEXT: buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2778, 32, 17, 64, 5376
; GFX90A-V2A-EN-NEXT: buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2779, 32, 17, 64, 5120
; GFX90A-V2A-EN-NEXT: buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2780, 32, 17, 64, 4864
; GFX90A-V2A-EN-NEXT: buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2781, 32, 17, 64, 4608
; GFX90A-V2A-EN-NEXT: buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2782, 32, 17, 64, 4352
; GFX90A-V2A-EN-NEXT: buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2783, 32, 17, 64, 4096
; GFX90A-V2A-EN-NEXT: buffer_store_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2792, 32, 17, 64, 3840
; GFX90A-V2A-EN-NEXT: buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2793, 32, 17, 64, 3584
; GFX90A-V2A-EN-NEXT: buffer_store_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2794, 32, 17, 64, 3328
; GFX90A-V2A-EN-NEXT: buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2795, 32, 17, 64, 3072
; GFX90A-V2A-EN-NEXT: buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2796, 32, 17, 64, 2816
; GFX90A-V2A-EN-NEXT: buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2797, 32, 17, 64, 2560
; GFX90A-V2A-EN-NEXT: buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2798, 32, 17, 64, 2304
; GFX90A-V2A-EN-NEXT: buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2799, 32, 17, 64, 2048
; GFX90A-V2A-EN-NEXT: buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2808, 32, 17, 64, 1792
; GFX90A-V2A-EN-NEXT: buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2809, 32, 17, 64, 1536
; GFX90A-V2A-EN-NEXT: buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2810, 32, 17, 64, 1280
; GFX90A-V2A-EN-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2811, 32, 17, 64, 1024
; GFX90A-V2A-EN-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2812, 32, 17, 64, 768
; GFX90A-V2A-EN-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2813, 32, 17, 64, 512
; GFX90A-V2A-EN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2814, 32, 17, 64, 256
; GFX90A-V2A-EN-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2815, 32, 17, 64, 0
; GFX90A-V2A-EN-NEXT: ;;#ASMSTART
; GFX90A-V2A-EN-NEXT: ; clobber nonpreserved SGPRs
; GFX90A-V2A-EN-NEXT: ;;#ASMEND
@@ -1278,7 +1615,6 @@ define void @callee_need_to_spill_fp_to_memory() #1 {
; GFX90A-V2A-EN-NEXT: buffer_load_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Reload
; GFX90A-V2A-EN-NEXT: buffer_load_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Reload
; GFX90A-V2A-EN-NEXT: buffer_load_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Reload
-; GFX90A-V2A-EN-NEXT: s_addk_i32 s32, 0x5100
; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v95, a31 ; Reload Reuse
; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v94, a30 ; Reload Reuse
; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v93, a29 ; Reload Reuse
@@ -1498,118 +1834,231 @@ define void @callee_need_to_spill_fp_to_memory() #1 {
; WAVE32-NEXT: .cfi_register 65, 72
; WAVE32-NEXT: s_mov_b32 s33, s32
; WAVE32-NEXT: .cfi_def_cfa_register 65
+; WAVE32-NEXT: s_addk_i32 s32, 0x3880
; WAVE32-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1576, 32, 1, 32, 14208
; WAVE32-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1577, 32, 1, 32, 14080
; WAVE32-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1578, 32, 1, 32, 13952
; WAVE32-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1579, 32, 1, 32, 13824
; WAVE32-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1580, 32, 1, 32, 13696
; WAVE32-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1581, 32, 1, 32, 13568
; WAVE32-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1582, 32, 1, 32, 13440
; WAVE32-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1583, 32, 1, 32, 13312
; WAVE32-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1592, 32, 1, 32, 13184
; WAVE32-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1593, 32, 1, 32, 13056
; WAVE32-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1594, 32, 1, 32, 12928
; WAVE32-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1595, 32, 1, 32, 12800
; WAVE32-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1596, 32, 1, 32, 12672
; WAVE32-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1597, 32, 1, 32, 12544
; WAVE32-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1598, 32, 1, 32, 12416
; WAVE32-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1599, 32, 1, 32, 12288
; WAVE32-NEXT: buffer_store_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1608, 32, 1, 32, 12160
; WAVE32-NEXT: buffer_store_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1609, 32, 1, 32, 12032
; WAVE32-NEXT: buffer_store_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1610, 32, 1, 32, 11904
; WAVE32-NEXT: buffer_store_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1611, 32, 1, 32, 11776
; WAVE32-NEXT: buffer_store_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1612, 32, 1, 32, 11648
; WAVE32-NEXT: buffer_store_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1613, 32, 1, 32, 11520
; WAVE32-NEXT: buffer_store_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1614, 32, 1, 32, 11392
; WAVE32-NEXT: buffer_store_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1615, 32, 1, 32, 11264
; WAVE32-NEXT: buffer_store_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1624, 32, 1, 32, 11136
; WAVE32-NEXT: buffer_store_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1625, 32, 1, 32, 11008
; WAVE32-NEXT: buffer_store_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1626, 32, 1, 32, 10880
; WAVE32-NEXT: buffer_store_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1627, 32, 1, 32, 10752
; WAVE32-NEXT: buffer_store_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1628, 32, 1, 32, 10624
; WAVE32-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1629, 32, 1, 32, 10496
; WAVE32-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1630, 32, 1, 32, 10368
; WAVE32-NEXT: buffer_store_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1631, 32, 1, 32, 10240
; WAVE32-NEXT: buffer_store_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1640, 32, 1, 32, 10112
; WAVE32-NEXT: buffer_store_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1641, 32, 1, 32, 9984
; WAVE32-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1642, 32, 1, 32, 9856
; WAVE32-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1643, 32, 1, 32, 9728
; WAVE32-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1644, 32, 1, 32, 9600
; WAVE32-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1645, 32, 1, 32, 9472
; WAVE32-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1646, 32, 1, 32, 9344
; WAVE32-NEXT: buffer_store_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1647, 32, 1, 32, 9216
; WAVE32-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1656, 32, 1, 32, 9088
; WAVE32-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1657, 32, 1, 32, 8960
; WAVE32-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1658, 32, 1, 32, 8832
; WAVE32-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1659, 32, 1, 32, 8704
; WAVE32-NEXT: buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1660, 32, 1, 32, 8576
; WAVE32-NEXT: buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1661, 32, 1, 32, 8448
; WAVE32-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1662, 32, 1, 32, 8320
; WAVE32-NEXT: buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1663, 32, 1, 32, 8192
; WAVE32-NEXT: buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1672, 32, 1, 32, 8064
; WAVE32-NEXT: buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1673, 32, 1, 32, 7936
; WAVE32-NEXT: buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1674, 32, 1, 32, 7808
; WAVE32-NEXT: buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1675, 32, 1, 32, 7680
; WAVE32-NEXT: buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1676, 32, 1, 32, 7552
; WAVE32-NEXT: buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1677, 32, 1, 32, 7424
; WAVE32-NEXT: buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1678, 32, 1, 32, 7296
; WAVE32-NEXT: buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1679, 32, 1, 32, 7168
; WAVE32-NEXT: buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1688, 32, 1, 32, 7040
; WAVE32-NEXT: buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1689, 32, 1, 32, 6912
; WAVE32-NEXT: buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1690, 32, 1, 32, 6784
; WAVE32-NEXT: buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1691, 32, 1, 32, 6656
; WAVE32-NEXT: buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1692, 32, 1, 32, 6528
; WAVE32-NEXT: buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1693, 32, 1, 32, 6400
; WAVE32-NEXT: buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1694, 32, 1, 32, 6272
; WAVE32-NEXT: buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1695, 32, 1, 32, 6144
; WAVE32-NEXT: buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1704, 32, 1, 32, 6016
; WAVE32-NEXT: buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1705, 32, 1, 32, 5888
; WAVE32-NEXT: buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1706, 32, 1, 32, 5760
; WAVE32-NEXT: buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1707, 32, 1, 32, 5632
; WAVE32-NEXT: buffer_store_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1708, 32, 1, 32, 5504
; WAVE32-NEXT: buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1709, 32, 1, 32, 5376
; WAVE32-NEXT: buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1710, 32, 1, 32, 5248
; WAVE32-NEXT: buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1711, 32, 1, 32, 5120
; WAVE32-NEXT: buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1720, 32, 1, 32, 4992
; WAVE32-NEXT: buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1721, 32, 1, 32, 4864
; WAVE32-NEXT: buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1722, 32, 1, 32, 4736
; WAVE32-NEXT: buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1723, 32, 1, 32, 4608
; WAVE32-NEXT: buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1724, 32, 1, 32, 4480
; WAVE32-NEXT: buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1725, 32, 1, 32, 4352
; WAVE32-NEXT: buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1726, 32, 1, 32, 4224
; WAVE32-NEXT: buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1727, 32, 1, 32, 4096
; WAVE32-NEXT: buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1736, 32, 1, 32, 3968
; WAVE32-NEXT: buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1737, 32, 1, 32, 3840
; WAVE32-NEXT: buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1738, 32, 1, 32, 3712
; WAVE32-NEXT: buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1739, 32, 1, 32, 3584
; WAVE32-NEXT: buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1740, 32, 1, 32, 3456
; WAVE32-NEXT: buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1741, 32, 1, 32, 3328
; WAVE32-NEXT: buffer_store_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1742, 32, 1, 32, 3200
; WAVE32-NEXT: buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1743, 32, 1, 32, 3072
; WAVE32-NEXT: buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1752, 32, 1, 32, 2944
; WAVE32-NEXT: buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1753, 32, 1, 32, 2816
; WAVE32-NEXT: buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1754, 32, 1, 32, 2688
; WAVE32-NEXT: buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1755, 32, 1, 32, 2560
; WAVE32-NEXT: buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1756, 32, 1, 32, 2432
; WAVE32-NEXT: buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1757, 32, 1, 32, 2304
; WAVE32-NEXT: buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1758, 32, 1, 32, 2176
; WAVE32-NEXT: buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1759, 32, 1, 32, 2048
; WAVE32-NEXT: buffer_store_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1768, 32, 1, 32, 1920
; WAVE32-NEXT: buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1769, 32, 1, 32, 1792
; WAVE32-NEXT: buffer_store_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1770, 32, 1, 32, 1664
; WAVE32-NEXT: buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1771, 32, 1, 32, 1536
; WAVE32-NEXT: buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1772, 32, 1, 32, 1408
; WAVE32-NEXT: buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1773, 32, 1, 32, 1280
; WAVE32-NEXT: buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1774, 32, 1, 32, 1152
; WAVE32-NEXT: buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1775, 32, 1, 32, 1024
; WAVE32-NEXT: buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1784, 32, 1, 32, 896
; WAVE32-NEXT: buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1785, 32, 1, 32, 768
; WAVE32-NEXT: buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1786, 32, 1, 32, 640
; WAVE32-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1787, 32, 1, 32, 512
; WAVE32-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1788, 32, 1, 32, 384
; WAVE32-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1789, 32, 1, 32, 256
; WAVE32-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1790, 32, 1, 32, 128
; WAVE32-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1791, 32, 1, 32, 0
; WAVE32-NEXT: ;;#ASMSTART
; WAVE32-NEXT: ; clobber nonpreserved SGPRs
; WAVE32-NEXT: ;;#ASMEND
@@ -1730,7 +2179,6 @@ define void @callee_need_to_spill_fp_to_memory() #1 {
; WAVE32-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436
; WAVE32-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440
; WAVE32-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444
-; WAVE32-NEXT: s_addk_i32 s32, 0x3880
; WAVE32-NEXT: s_mov_b32 s32, s33
; WAVE32-NEXT: .cfi_def_cfa_register 64
; WAVE32-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
@@ -1998,12 +2446,13 @@ define hidden void @func_call_clobber() #0 {
; GFX900-NEXT: v_writelane_b32 v40, s16, 2
; GFX900-NEXT: .cfi_llvm_vector_registers 65, 2600, 2, 32
; GFX900-NEXT: .cfi_def_cfa_register 65
-; GFX900-NEXT: s_addk_i32 s32, 0x400
; GFX900-NEXT: v_writelane_b32 v40, s30, 0
+; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: v_writelane_b32 v40, s31, 1
+; GFX900-NEXT: .cfi_llvm_vector_registers 16, 2815, 0, 32, 2815, 1, 32
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
-; GFX900-NEXT: v_writelane_b32 v40, s31, 1
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX900-NEXT: v_readlane_b32 s30, v40, 0
; GFX900-NEXT: v_readlane_b32 s31, v40, 1
@@ -2271,12 +2720,13 @@ define hidden void @func_call_clobber() #0 {
; GFX90A-V2A-DIS-NEXT: v_writelane_b32 v40, s16, 2
; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_registers 65, 2600, 2, 32
; GFX90A-V2A-DIS-NEXT: .cfi_def_cfa_register 65
-; GFX90A-V2A-DIS-NEXT: s_addk_i32 s32, 0x400
; GFX90A-V2A-DIS-NEXT: v_writelane_b32 v40, s30, 0
+; GFX90A-V2A-DIS-NEXT: s_addk_i32 s32, 0x400
+; GFX90A-V2A-DIS-NEXT: v_writelane_b32 v40, s31, 1
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_registers 16, 2815, 0, 32, 2815, 1, 32
; GFX90A-V2A-DIS-NEXT: s_getpc_b64 s[16:17]
; GFX90A-V2A-DIS-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX90A-V2A-DIS-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
-; GFX90A-V2A-DIS-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-V2A-DIS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s30, v40, 0
; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s31, v40, 1
@@ -2544,12 +2994,13 @@ define hidden void @func_call_clobber() #0 {
; GFX90A-V2A-EN-NEXT: v_writelane_b32 v40, s16, 2
; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_registers 65, 2600, 2, 32
; GFX90A-V2A-EN-NEXT: .cfi_def_cfa_register 65
-; GFX90A-V2A-EN-NEXT: s_addk_i32 s32, 0x400
; GFX90A-V2A-EN-NEXT: v_writelane_b32 v40, s30, 0
+; GFX90A-V2A-EN-NEXT: s_addk_i32 s32, 0x400
+; GFX90A-V2A-EN-NEXT: v_writelane_b32 v40, s31, 1
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_registers 16, 2815, 0, 32, 2815, 1, 32
; GFX90A-V2A-EN-NEXT: s_getpc_b64 s[16:17]
; GFX90A-V2A-EN-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX90A-V2A-EN-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
-; GFX90A-V2A-EN-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-V2A-EN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX90A-V2A-EN-NEXT: v_readlane_b32 s30, v40, 0
; GFX90A-V2A-EN-NEXT: v_readlane_b32 s31, v40, 1
@@ -2788,10 +3239,11 @@ define hidden void @func_call_clobber() #0 {
; WAVE32-NEXT: .cfi_def_cfa_register 65
; WAVE32-NEXT: v_writelane_b32 v40, s30, 0
; WAVE32-NEXT: s_addk_i32 s32, 0x200
+; WAVE32-NEXT: v_writelane_b32 v40, s31, 1
+; WAVE32-NEXT: .cfi_llvm_vector_registers 16, 1791, 0, 32, 1791, 1, 32
; WAVE32-NEXT: s_getpc_b64 s[16:17]
; WAVE32-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; WAVE32-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
-; WAVE32-NEXT: v_writelane_b32 v40, s31, 1
; WAVE32-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE32-NEXT: v_readlane_b32 s30, v40, 0
; WAVE32-NEXT: v_readlane_b32 s31, v40, 1
@@ -2819,7 +3271,9 @@ define hidden void @func_spill_vgpr_to_vmem() #0 {
; GFX900-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 256
; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; clobber
; GFX900-NEXT: ;;#ASMEND
@@ -2845,9 +3299,13 @@ define hidden void @func_spill_vgpr_to_vmem() #0 {
; GFX90A-V2A-DIS-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32
; GFX90A-V2A-DIS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 768
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 512
; GFX90A-V2A-DIS-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 3104, 32, 17, 64, 256
; GFX90A-V2A-DIS-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 3105, 32, 17, 64, 0
; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART
; GFX90A-V2A-DIS-NEXT: ; clobber
; GFX90A-V2A-DIS-NEXT: ;;#ASMEND
@@ -2879,9 +3337,13 @@ define hidden void @func_spill_vgpr_to_vmem() #0 {
; GFX90A-V2A-EN-NEXT: .cfi_undefined 3073
; GFX90A-V2A-EN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2600, 3072, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2601, 3073, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v0, a32 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 3104, 2560, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v1, a33 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 3105, 2561, 32, 17, 64
; GFX90A-V2A-EN-NEXT: ;;#ASMSTART
; GFX90A-V2A-EN-NEXT: ; clobber
; GFX90A-V2A-EN-NEXT: ;;#ASMEND
@@ -2908,7 +3370,9 @@ define hidden void @func_spill_vgpr_to_vmem() #0 {
; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32
; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; WAVE32-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1576, 32, 1, 32, 128
; WAVE32-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1577, 32, 1, 32, 0
; WAVE32-NEXT: ;;#ASMSTART
; WAVE32-NEXT: ; clobber
; WAVE32-NEXT: ;;#ASMEND
@@ -2943,7 +3407,9 @@ define hidden void @func_spill_vgpr_to_agpr() #2 {
; GFX900-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 256
; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX900-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; clobber
; GFX900-NEXT: ;;#ASMEND
@@ -2969,9 +3435,13 @@ define hidden void @func_spill_vgpr_to_agpr() #2 {
; GFX90A-V2A-DIS-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32
; GFX90A-V2A-DIS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 768
; GFX90A-V2A-DIS-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 512
; GFX90A-V2A-DIS-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 3104, 32, 17, 64, 256
; GFX90A-V2A-DIS-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 3105, 32, 17, 64, 0
; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART
; GFX90A-V2A-DIS-NEXT: ; clobber
; GFX90A-V2A-DIS-NEXT: ;;#ASMEND
@@ -3003,9 +3473,13 @@ define hidden void @func_spill_vgpr_to_agpr() #2 {
; GFX90A-V2A-EN-NEXT: .cfi_undefined 3073
; GFX90A-V2A-EN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2600, 3072, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2601, 3073, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v0, a32 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 3104, 2560, 32, 17, 64
; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v1, a33 ; Reload Reuse
+; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 3105, 2561, 32, 17, 64
; GFX90A-V2A-EN-NEXT: ;;#ASMSTART
; GFX90A-V2A-EN-NEXT: ; clobber
; GFX90A-V2A-EN-NEXT: ;;#ASMEND
@@ -3032,7 +3506,9 @@ define hidden void @func_spill_vgpr_to_agpr() #2 {
; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32
; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; WAVE32-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1576, 32, 1, 32, 128
; WAVE32-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE32-NEXT: .cfi_llvm_vector_offset 1577, 32, 1, 32, 0
; WAVE32-NEXT: ;;#ASMSTART
; WAVE32-NEXT: ; clobber
; WAVE32-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index fb2da9a5b934c..0ba37bdf2dd8c 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -489,20 +489,40 @@ define weak_odr void @test(i32 %0) #1 !dbg !34 {
; CHECK-NEXT: v_writelane_b32 v41, s16, 16
; CHECK-NEXT: .cfi_llvm_vector_registers 65, 2601, 16, 32
; CHECK-NEXT: .cfi_def_cfa_register 65
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 0
; CHECK-NEXT: v_writelane_b32 v41, s34, 0
+; CHECK-NEXT: .cfi_llvm_vector_registers 66, 2622, 0, 32
; CHECK-NEXT: v_writelane_b32 v41, s35, 1
+; CHECK-NEXT: .cfi_llvm_vector_registers 67, 2622, 1, 32
; CHECK-NEXT: v_writelane_b32 v41, s36, 2
+; CHECK-NEXT: .cfi_llvm_vector_registers 68, 2622, 2, 32
; CHECK-NEXT: v_writelane_b32 v41, s37, 3
+; CHECK-NEXT: .cfi_llvm_vector_registers 69, 2622, 3, 32
; CHECK-NEXT: v_writelane_b32 v41, s38, 4
+; CHECK-NEXT: .cfi_llvm_vector_registers 70, 2622, 4, 32
; CHECK-NEXT: v_writelane_b32 v41, s39, 5
+; CHECK-NEXT: .cfi_llvm_vector_registers 71, 2622, 5, 32
; CHECK-NEXT: v_writelane_b32 v41, s48, 6
+; CHECK-NEXT: .cfi_llvm_vector_registers 80, 2622, 6, 32
; CHECK-NEXT: v_writelane_b32 v41, s49, 7
+; CHECK-NEXT: .cfi_llvm_vector_registers 81, 2622, 7, 32
; CHECK-NEXT: v_writelane_b32 v41, s50, 8
+; CHECK-NEXT: .cfi_llvm_vector_registers 82, 2622, 8, 32
; CHECK-NEXT: v_writelane_b32 v41, s51, 9
+; CHECK-NEXT: .cfi_llvm_vector_registers 83, 2622, 9, 32
; CHECK-NEXT: v_writelane_b32 v41, s52, 10
-; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: .cfi_llvm_vector_registers 84, 2622, 10, 32
; CHECK-NEXT: v_writelane_b32 v41, s53, 11
+; CHECK-NEXT: .cfi_llvm_vector_registers 85, 2622, 11, 32
; CHECK-NEXT: v_writelane_b32 v41, s54, 12
+; CHECK-NEXT: .cfi_llvm_vector_registers 86, 2622, 12, 32
+; CHECK-NEXT: v_writelane_b32 v41, s55, 13
+; CHECK-NEXT: .cfi_llvm_vector_registers 87, 2622, 13, 32
+; CHECK-NEXT: v_writelane_b32 v41, s30, 14
+; CHECK-NEXT: v_writelane_b32 v41, s31, 15
+; CHECK-NEXT: .cfi_llvm_vector_registers 16, 2622, 14, 32, 2622, 15, 32
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef
; CHECK-NEXT: .Ltmp0:
@@ -510,12 +530,8 @@ define weak_odr void @test(i32 %0) #1 !dbg !34 {
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12
-; CHECK-NEXT: v_writelane_b32 v41, s55, 13
; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v41, s30, 14
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v41, s31, 15
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index fbacc61492674..cd0c88a13c46a 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -286,19 +286,18 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
; CHECK-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-TRUE16-NEXT: s_mov_b32 exec_lo, s1
; CHECK-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
+; CHECK-TRUE16-NEXT: s_add_co_i32 s32, s32, 16
+; CHECK-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-TRUE16-NEXT: v_mov_b16_e32 v0.l, 15
; CHECK-TRUE16-NEXT: s_mov_b32 s1, callee@abs32@hi
; CHECK-TRUE16-NEXT: s_mov_b32 s0, callee@abs32@lo
-; CHECK-TRUE16-NEXT: s_add_co_i32 s32, s32, 16
-; CHECK-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-TRUE16-NEXT: s_wait_storecnt 0x0
; CHECK-TRUE16-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
; CHECK-TRUE16-NEXT: s_wait_storecnt 0x0
; CHECK-TRUE16-NEXT: v_mov_b32_e32 v0, 0x47
-; CHECK-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; CHECK-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -326,19 +325,18 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
; CHECK-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-FAKE16-NEXT: s_mov_b32 exec_lo, s1
; CHECK-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
+; CHECK-FAKE16-NEXT: s_add_co_i32 s32, s32, 16
+; CHECK-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-FAKE16-NEXT: v_mov_b32_e32 v0, 15
; CHECK-FAKE16-NEXT: s_mov_b32 s1, callee@abs32@hi
; CHECK-FAKE16-NEXT: s_mov_b32 s0, callee@abs32@lo
-; CHECK-FAKE16-NEXT: s_add_co_i32 s32, s32, 16
-; CHECK-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-FAKE16-NEXT: s_wait_storecnt 0x0
; CHECK-FAKE16-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
; CHECK-FAKE16-NEXT: s_wait_storecnt 0x0
; CHECK-FAKE16-NEXT: v_mov_b32_e32 v0, 0x47
-; CHECK-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-FAKE16-NEXT: s_mov_b32 s32, s33
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
index 516b2a53c85d5..e71ee5d40e05f 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
@@ -1650,21 +1650,37 @@ body: |
; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr55
; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec, 64, 3840
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.3, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec, 64, 3584
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.4, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec, 64, 3328
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.5, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec, 64, 3072
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.6, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec, 64, 2816
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.7, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec, 64, 2560
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.8, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec, 64, 2304
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.9, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr47, 32, $exec, 64, 2048
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.10, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec, 64, 1792
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.11, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec, 64, 1536
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.12, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec, 64, 1280
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.13, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec, 64, 1024
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.14, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec, 64, 768
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.15, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec, 64, 512
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.16, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr62, 32, $exec, 64, 256
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.17, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr63, 32, $exec, 64, 0
; GFX8-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX8-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX8-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -1762,21 +1778,37 @@ body: |
; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr55
; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec, 64, 3840
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.3, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec, 64, 3584
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.4, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec, 64, 3328
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.5, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec, 64, 3072
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.6, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec, 64, 2816
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.7, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec, 64, 2560
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.8, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec, 64, 2304
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.9, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr47, 32, $exec, 64, 2048
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.10, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec, 64, 1792
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.11, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec, 64, 1536
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.12, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec, 64, 1280
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.13, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec, 64, 1024
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.14, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec, 64, 768
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.15, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec, 64, 512
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.16, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr62, 32, $exec, 64, 256
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.17, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr63, 32, $exec, 64, 0
; GFX900-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX900-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX900-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -1889,21 +1921,37 @@ body: |
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr15
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr40, $agpr0, 32, $exec, 64
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr41, $agpr1, 32, $exec, 64
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr42, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr42, $agpr2, 32, $exec, 64
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr43, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr43, $agpr3, 32, $exec, 64
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr44, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr44, $agpr4, 32, $exec, 64
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr45, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr45, $agpr5, 32, $exec, 64
; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr46, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr46, $agpr6, 32, $exec, 64
; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr47, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr47, $agpr7, 32, $exec, 64
; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr56, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr56, $agpr8, 32, $exec, 64
; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr57, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr57, $agpr9, 32, $exec, 64
; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr58, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr58, $agpr10, 32, $exec, 64
; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr59, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr59, $agpr11, 32, $exec, 64
; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr60, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr60, $agpr12, 32, $exec, 64
; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr61, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr61, $agpr13, 32, $exec, 64
; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr62, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr62, $agpr14, 32, $exec, 64
; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr63, $agpr15, 32, $exec, 64
; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -2000,21 +2048,37 @@ body: |
; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr55
; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec_lo, 32, 1920
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.3, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec_lo, 32, 1792
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.4, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec_lo, 32, 1664
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.5, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec_lo, 32, 1536
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.6, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec_lo, 32, 1408
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.7, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec_lo, 32, 1280
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.8, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec_lo, 32, 1152
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.9, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr47, 32, $exec_lo, 32, 1024
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.10, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec_lo, 32, 896
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.11, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec_lo, 32, 768
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.12, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec_lo, 32, 640
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.13, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec_lo, 32, 512
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.14, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec_lo, 32, 384
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.15, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec_lo, 32, 256
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.16, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr62, 32, $exec_lo, 32, 128
; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.17, addrspace 5)
+ ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr63, 32, $exec_lo, 32, 0
; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -2109,21 +2173,37 @@ body: |
; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr55
; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec_lo, 32, 1920
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr41, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.3, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec_lo, 32, 1792
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr42, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.4, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec_lo, 32, 1664
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr43, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.5, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec_lo, 32, 1536
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr44, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.6, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec_lo, 32, 1408
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr45, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.7, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec_lo, 32, 1280
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr46, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.8, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec_lo, 32, 1152
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr47, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.9, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr47, 32, $exec_lo, 32, 1024
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr56, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.10, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec_lo, 32, 896
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr57, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.11, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec_lo, 32, 768
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr58, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.12, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec_lo, 32, 640
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr59, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.13, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec_lo, 32, 512
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr60, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.14, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec_lo, 32, 384
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr61, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.15, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec_lo, 32, 256
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr62, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.16, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr62, 32, $exec_lo, 32, 128
; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr63, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.17, addrspace 5)
+ ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr63, 32, $exec_lo, 32, 0
; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -2219,21 +2299,37 @@ body: |
; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr55
; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec_lo, 32, 1920
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr41, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.3, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec_lo, 32, 1792
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr42, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.4, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec_lo, 32, 1664
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr43, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.5, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec_lo, 32, 1536
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr44, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.6, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec_lo, 32, 1408
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr45, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.7, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec_lo, 32, 1280
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr46, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.8, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec_lo, 32, 1152
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr47, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.9, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr47, 32, $exec_lo, 32, 1024
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr56, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.10, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec_lo, 32, 896
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr57, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.11, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec_lo, 32, 768
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr58, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.12, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec_lo, 32, 640
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr59, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.13, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec_lo, 32, 512
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr60, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.14, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec_lo, 32, 384
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr61, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.15, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec_lo, 32, 256
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr62, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.16, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr62, 32, $exec_lo, 32, 128
; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr63, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.17, addrspace 5)
+ ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr63, 32, $exec_lo, 32, 0
; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
index 2d620a14da405..06e85bf02a9e1 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
@@ -18,40 +18,40 @@ define void @wobble() #0 {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 exec_lo, s17
; CHECK-NEXT: v_writelane_b32 v43, s16, 15
+; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_mov_b32_e32 v40, v31
-; CHECK-NEXT: v_mov_b32_e32 v41, 0
-; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v43, s34, 0
; CHECK-NEXT: v_writelane_b32 v43, s35, 1
-; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: v_writelane_b32 v43, s36, 2
; CHECK-NEXT: v_writelane_b32 v43, s37, 3
-; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT: s_mov_b64 s[8:9], src_private_base
-; CHECK-NEXT: v_mov_b32_e32 v42, s9
; CHECK-NEXT: v_writelane_b32 v43, s38, 4
; CHECK-NEXT: v_writelane_b32 v43, s39, 5
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_writelane_b32 v43, s48, 6
; CHECK-NEXT: v_writelane_b32 v43, s49, 7
-; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
-; CHECK-NEXT: s_lshr_b32 s5, s33, 5
-; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_writelane_b32 v43, s50, 8
-; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: v_writelane_b32 v43, s51, 9
-; CHECK-NEXT: s_mov_b32 s51, s14
; CHECK-NEXT: v_writelane_b32 v43, s52, 10
-; CHECK-NEXT: s_mov_b32 s52, s13
; CHECK-NEXT: v_writelane_b32 v43, s53, 11
-; CHECK-NEXT: s_mov_b32 s53, s12
; CHECK-NEXT: v_writelane_b32 v43, s54, 12
-; CHECK-NEXT: s_add_i32 s54, s5, 16
; CHECK-NEXT: v_writelane_b32 v43, s30, 13
; CHECK-NEXT: v_writelane_b32 v43, s31, 14
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[8:9], src_private_base
+; CHECK-NEXT: v_mov_b32_e32 v40, v31
+; CHECK-NEXT: v_mov_b32_e32 v41, 0
+; CHECK-NEXT: v_mov_b32_e32 v42, s9
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
+; CHECK-NEXT: s_lshr_b32 s5, s33, 5
+; CHECK-NEXT: s_mov_b32 s50, s15
+; CHECK-NEXT: s_mov_b32 s51, s14
+; CHECK-NEXT: s_mov_b32 s52, s13
+; CHECK-NEXT: s_mov_b32 s53, s12
+; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_add_i32 s54, s5, 16
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_1: ; %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.mir
index de4a3cde9fd14..210040b8c3da9 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.mir
@@ -23,6 +23,7 @@ body: |
; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr5
; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr6
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec_lo, 32, 0
; CHECK-NEXT: renamable $vgpr41 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: renamable $sgpr4 = S_MOV_B32 0
; CHECK-NEXT: renamable $sgpr5 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc
diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index cba5aa8ef3672..f5832e6f307fd 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -22,13 +22,14 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
; GCN-NEXT: v_writelane_b32 v42, s34, 3
; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: s_addk_i32 s32, 0x3000
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: v_writelane_b32 v42, s30, 0
+; GCN-NEXT: v_writelane_b32 v42, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_writelane_b32 v42, s30, 0
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:92
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:88
@@ -46,7 +47,6 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v8
-; GCN-NEXT: v_writelane_b32 v42, s31, 1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir
index 2a3e2be34586f..dae03b2ff860e 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir
@@ -455,7 +455,7 @@ body: |
liveins: $sgpr4, $sgpr5, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
- ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6
; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -501,21 +501,37 @@ body: |
; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr55
; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.1, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec, 64, 3840
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec, 64, 3584
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.3, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec, 64, 3328
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.4, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec, 64, 3072
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.5, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec, 64, 2816
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.6, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec, 64, 2560
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.7, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec, 64, 2304
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.8, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr47, 32, $exec, 64, 2048
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.9, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec, 64, 1792
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.10, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec, 64, 1536
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.11, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec, 64, 1280
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.12, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec, 64, 1024
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.13, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec, 64, 768
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.14, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec, 64, 512
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.15, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr62, 32, $exec, 64, 256
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.16, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr63, 32, $exec, 64, 0
; GFX8-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX8-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX8-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -558,7 +574,7 @@ body: |
; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
;
; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
- ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
; GFX900-NEXT: {{ $}}
; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6
; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32
@@ -604,21 +620,37 @@ body: |
; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr55
; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.1, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec, 64, 3840
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec, 64, 3584
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.3, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec, 64, 3328
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.4, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec, 64, 3072
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.5, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec, 64, 2816
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.6, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec, 64, 2560
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.7, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec, 64, 2304
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.8, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr47, 32, $exec, 64, 2048
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.9, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec, 64, 1792
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.10, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec, 64, 1536
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.11, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec, 64, 1280
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.12, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec, 64, 1024
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.13, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec, 64, 768
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.14, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec, 64, 512
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.15, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr62, 32, $exec, 64, 256
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.16, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr63, 32, $exec, 64, 0
; GFX900-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX900-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX900-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -722,21 +754,37 @@ body: |
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr15
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr40, $agpr0, 32, $exec, 64
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr41, $agpr1, 32, $exec, 64
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr42, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr42, $agpr2, 32, $exec, 64
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr43, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr43, $agpr3, 32, $exec, 64
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr44, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr44, $agpr4, 32, $exec, 64
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr45, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr45, $agpr5, 32, $exec, 64
; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr46, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr46, $agpr6, 32, $exec, 64
; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr47, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr47, $agpr7, 32, $exec, 64
; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr56, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr56, $agpr8, 32, $exec, 64
; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr57, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr57, $agpr9, 32, $exec, 64
; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr58, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr58, $agpr10, 32, $exec, 64
; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr59, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr59, $agpr11, 32, $exec, 64
; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr60, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr60, $agpr12, 32, $exec, 64
; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr61, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr61, $agpr13, 32, $exec, 64
; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr62, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr62, $agpr14, 32, $exec, 64
; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr63, $agpr15, 32, $exec, 64
; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -868,21 +916,37 @@ body: |
; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr55
; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec, 64, 3840
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.3, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec, 64, 3584
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.4, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec, 64, 3328
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.5, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec, 64, 3072
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.6, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec, 64, 2816
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.7, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec, 64, 2560
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.8, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec, 64, 2304
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.9, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr47, 32, $exec, 64, 2048
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.10, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec, 64, 1792
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.11, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec, 64, 1536
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.12, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec, 64, 1280
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.13, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec, 64, 1024
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.14, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec, 64, 768
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.15, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec, 64, 512
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.16, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr62, 32, $exec, 64, 256
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.17, addrspace 5)
+ ; GFX8-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr63, 32, $exec, 64, 0
; GFX8-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX8-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX8-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -979,21 +1043,37 @@ body: |
; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr55
; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec, 64, 3840
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.3, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec, 64, 3584
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.4, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec, 64, 3328
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.5, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec, 64, 3072
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.6, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec, 64, 2816
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.7, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec, 64, 2560
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.8, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec, 64, 2304
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.9, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr47, 32, $exec, 64, 2048
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.10, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec, 64, 1792
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.11, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec, 64, 1536
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.12, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec, 64, 1280
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.13, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec, 64, 1024
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.14, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec, 64, 768
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.15, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec, 64, 512
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.16, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr62, 32, $exec, 64, 256
; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.17, addrspace 5)
+ ; GFX900-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr63, 32, $exec, 64, 0
; GFX900-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX900-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX900-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -1105,21 +1185,37 @@ body: |
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr15
; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr4
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr40, $agpr0, 32, $exec, 64
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr41, $agpr1, 32, $exec, 64
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr42, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr42, $agpr2, 32, $exec, 64
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr43, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr43, $agpr3, 32, $exec, 64
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr44, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr44, $agpr4, 32, $exec, 64
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr45, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr45, $agpr5, 32, $exec, 64
; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr46, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr46, $agpr6, 32, $exec, 64
; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr47, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr47, $agpr7, 32, $exec, 64
; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr56, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr56, $agpr8, 32, $exec, 64
; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr57, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr57, $agpr9, 32, $exec, 64
; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr58, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr58, $agpr10, 32, $exec, 64
; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr59, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr59, $agpr11, 32, $exec, 64
; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr60, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr60, $agpr12, 32, $exec, 64
; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr61, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr61, $agpr13, 32, $exec, 64
; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr62, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr62, $agpr14, 32, $exec, 64
; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
+ ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $vgpr63, $agpr15, 32, $exec, 64
; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 6abe5998d6767..2760c7a2187b4 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -16,13 +16,13 @@ define void @callee_with_stack_and_call() #0 {
; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9]
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s4, 2
-; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0
+; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400
+; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1
; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0
; SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5]
; SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1
; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index aff30d682f20a..fea6a729ed986 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -2055,14 +2055,14 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-NEXT: v_writelane_b32 v40, s19, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[20:21]
; GFX9-NEXT: s_add_u32 s20, s20, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s21, s21, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[20:21], s[20:21], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s19, 2
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -2084,17 +2084,16 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
; GFX11-NEXT: s_or_saveexec_b32 s16, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s16
+; GFX11-NEXT: v_writelane_b32 v40, s3, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[16:17]
; GFX11-NEXT: s_add_u32 s16, s16, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s17, s17, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s3, 2
; GFX11-NEXT: s_load_b64 s[16:17], s[16:17], 0x0
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -2457,21 +2456,24 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p
; GFX9-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[40:41]
+; GFX9-NEXT: v_writelane_b32 v40, s29, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v2, s28
; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48
; GFX9-NEXT: v_mov_b32_e32 v5, s27
; GFX9-NEXT: v_mov_b32_e32 v4, s26
; GFX9-NEXT: v_mov_b32_e32 v3, s25
; GFX9-NEXT: v_mov_b32_e32 v2, s24
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32
-; GFX9-NEXT: v_writelane_b32 v40, s29, 2
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v5, s23
; GFX9-NEXT: v_mov_b32_e32 v4, s22
; GFX9-NEXT: v_mov_b32_e32 v3, s21
; GFX9-NEXT: v_mov_b32_e32 v2, s20
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v3, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s16
; GFX9-NEXT: s_getpc_b64 s[16:17]
@@ -2480,7 +2482,6 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX9-NEXT: v_mov_b32_e32 v5, s19
; GFX9-NEXT: v_mov_b32_e32 v4, s18
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -2503,7 +2504,10 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p
; GFX11-NEXT: s_or_saveexec_b32 s26, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s26
+; GFX11-NEXT: v_writelane_b32 v40, s25, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v3, s21
; GFX11-NEXT: v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v9, s19
; GFX11-NEXT: s_getpc_b64 s[20:21]
@@ -2512,20 +2516,16 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p
; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s17
; GFX11-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v13, s3
; GFX11-NEXT: s_load_b64 s[16:17], s[20:21], 0x0
-; GFX11-NEXT: v_writelane_b32 v40, s25, 2
; GFX11-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v5, s23
; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v10, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b32 v[0:1], v14, off offset:48
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:32
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index f22ba70d32ae5..8bcf47fa6dbd7 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -13,6 +13,7 @@ define amdgpu_gfx void @gfx_func() #0 {
; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1
; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; SDAG-NEXT: s_mov_b64 exec, s[34:35]
+; SDAG-NEXT: s_addk_i32 s32, 0x400
; SDAG-NEXT: v_writelane_b32 v40, s4, 0
; SDAG-NEXT: v_writelane_b32 v40, s5, 1
; SDAG-NEXT: v_writelane_b32 v40, s6, 2
@@ -56,11 +57,10 @@ define amdgpu_gfx void @gfx_func() #0 {
; SDAG-NEXT: v_writelane_b32 v40, s94, 40
; SDAG-NEXT: v_writelane_b32 v40, s95, 41
; SDAG-NEXT: v_writelane_b32 v40, s30, 42
+; SDAG-NEXT: v_writelane_b32 v40, s31, 43
; SDAG-NEXT: s_mov_b32 s35, extern_c_func at abs32@hi
; SDAG-NEXT: s_mov_b32 s34, extern_c_func at abs32@lo
; SDAG-NEXT: s_mov_b64 s[8:9], 0
-; SDAG-NEXT: s_addk_i32 s32, 0x400
-; SDAG-NEXT: v_writelane_b32 v40, s31, 43
; SDAG-NEXT: s_swappc_b64 s[30:31], s[34:35]
; SDAG-NEXT: v_readlane_b32 s30, v40, 42
; SDAG-NEXT: v_readlane_b32 s31, v40, 43
@@ -122,6 +122,7 @@ define amdgpu_gfx void @gfx_func() #0 {
; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[34:35]
+; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s4, 0
; GISEL-NEXT: v_writelane_b32 v40, s5, 1
; GISEL-NEXT: v_writelane_b32 v40, s6, 2
@@ -165,11 +166,10 @@ define amdgpu_gfx void @gfx_func() #0 {
; GISEL-NEXT: v_writelane_b32 v40, s94, 40
; GISEL-NEXT: v_writelane_b32 v40, s95, 41
; GISEL-NEXT: v_writelane_b32 v40, s30, 42
+; GISEL-NEXT: v_writelane_b32 v40, s31, 43
; GISEL-NEXT: s_mov_b32 s34, extern_c_func at abs32@lo
; GISEL-NEXT: s_mov_b32 s35, extern_c_func at abs32@hi
; GISEL-NEXT: s_mov_b64 s[8:9], 0
-; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s31, 43
; GISEL-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GISEL-NEXT: v_readlane_b32 s30, v40, 42
; GISEL-NEXT: v_readlane_b32 s31, v40, 43
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 3f5ad3fc6e347..100707a5041d7 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -133,12 +133,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i1 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i1 at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -162,13 +162,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: s_mov_b32 s35, external_void_func_i1 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_i1 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -191,15 +191,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: v_mov_b32_e32 v0, 1
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_mov_b32_e32 v0, 1
; GFX11-NEXT: s_mov_b32 s1, external_void_func_i1 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_i1 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -221,13 +220,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i1 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i1 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -253,14 +252,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i1_signext at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i1_signext at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -284,14 +283,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i1_signext at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i1_signext at abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i1_signext at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i1_signext at abs32@lo
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -315,14 +314,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i1_signext at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i1_signext at abs32@lo
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i1_signext at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i1_signext at abs32@lo
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -346,14 +345,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
-; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i1_signext at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i1_signext at abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
+; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i1_signext at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i1_signext at abs32@lo
; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -382,14 +381,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i1_zeroext at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i1_zeroext at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -413,14 +412,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i1_zeroext at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i1_zeroext at abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i1_zeroext at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i1_zeroext at abs32@lo
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -444,14 +443,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i1_zeroext at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i1_zeroext at abs32@lo
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i1_zeroext at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i1_zeroext at abs32@lo
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -475,14 +474,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
-; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i1_zeroext at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i1_zeroext at abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
+; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i1_zeroext at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i1_zeroext at abs32@lo
; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -513,11 +512,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -540,12 +539,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -568,14 +567,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7b
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i8 at abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i8 at abs32@lo
; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7b
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i8 at abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i8 at abs32@lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -596,14 +595,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i8 at abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i8 at abs32@lo
; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i8 at abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i8 at abs32@lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
@@ -625,12 +624,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -656,14 +655,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_signext at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_signext at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_signext at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_signext at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -685,14 +684,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_signext at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_signext at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_signext at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_signext at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -714,16 +713,16 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: global_load_d16_i8 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i8_signext at abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i8_signext at abs32@lo
; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: global_load_d16_i8 v0, v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i8_signext at abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i8_signext at abs32@lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -743,16 +742,16 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: global_load_i8 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i8_signext at abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i8_signext at abs32@lo
; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: global_load_i8 v0, v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i8_signext at abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i8_signext at abs32@lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
@@ -773,14 +772,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc
-; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_signext at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_signext at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc
+; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_signext at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_signext at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -807,14 +806,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_zeroext at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_zeroext at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_zeroext at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_zeroext at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -836,14 +835,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_zeroext at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_zeroext at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_zeroext at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_zeroext at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -865,16 +864,16 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i8_zeroext at abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i8_zeroext at abs32@lo
; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i8_zeroext at abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i8_zeroext at abs32@lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -894,16 +893,16 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i8_zeroext at abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i8_zeroext at abs32@lo
; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i8_zeroext at abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i8_zeroext at abs32@lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
@@ -924,14 +923,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
-; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
+; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -960,11 +959,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i16@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -987,12 +986,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16@abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16@abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -1015,14 +1014,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7b
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i16@abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i16@abs32@lo
; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7b
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i16@abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i16@abs32@lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -1043,14 +1042,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i16@abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i16@abs32@lo
; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i16@abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i16@abs32@lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
@@ -1072,12 +1071,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16@abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -1103,14 +1102,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_i16_signext@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_i16_signext@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_i16_signext@abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_i16_signext@abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1132,14 +1131,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16_signext@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16_signext@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16_signext@abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16_signext@abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -1161,16 +1160,16 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -1190,16 +1189,16 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
@@ -1220,14 +1219,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc
-; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc
+; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -1254,14 +1253,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_i16_zeroext@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_i16_zeroext@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_i16_zeroext@abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_i16_zeroext@abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1283,14 +1282,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16_zeroext@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16_zeroext@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16_zeroext@abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16_zeroext@abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -1312,16 +1311,16 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -1341,16 +1340,16 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
@@ -1371,14 +1370,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc
-; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc
+; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -1407,11 +1406,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i32@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1434,12 +1433,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: v_mov_b32_e32 v0, 42
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i32@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i32@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i32@abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i32@abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -1462,14 +1461,14 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: v_mov_b32_e32 v0, 42
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i32@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i32@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i32@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i32@abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -1491,12 +1490,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i32@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i32@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i32@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i32@abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -1524,12 +1523,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i64@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1552,13 +1551,13 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_i64@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_i64@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -1581,14 +1580,14 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_i64@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_i64@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -1610,13 +1609,13 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i64@abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i64@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -1642,15 +1641,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64@abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1672,15 +1671,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64@abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -1702,16 +1701,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64@abs32@lo
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -1732,15 +1730,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64@abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64@abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -1769,14 +1767,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, 2
; GFX9-NEXT: v_mov_b32_e32 v2, 3
; GFX9-NEXT: v_mov_b32_e32 v3, 4
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1799,15 +1797,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64@abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -1830,15 +1828,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64@abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -1860,15 +1858,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64@abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64@abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -1894,17 +1892,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i64 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i64 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v4, 1
; GFX9-NEXT: v_mov_b32_e32 v5, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -1926,17 +1924,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_mov_b32_e32 v4, 1
; GFX10-NEXT: v_mov_b32_e32 v5, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i64 at abs32@hi
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i64 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -1958,17 +1956,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i64 at abs32@hi
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i64 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -1989,17 +1986,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i64 at abs32@hi
-; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i64 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -2028,19 +2025,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i64 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i64 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v4, 1
; GFX9-NEXT: v_mov_b32_e32 v5, 2
; GFX9-NEXT: v_mov_b32_e32 v6, 3
; GFX9-NEXT: v_mov_b32_e32 v7, 4
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -2062,19 +2059,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_mov_b32_e32 v4, 1
; GFX10-NEXT: v_mov_b32_e32 v5, 2
; GFX10-NEXT: v_mov_b32_e32 v6, 3
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v7, 4
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i64 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i64 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -2096,18 +2093,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i64 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i64 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -2128,19 +2125,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3
-; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 4
+; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i64 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i64 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -2170,11 +2167,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_f16 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_f16 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -2197,12 +2194,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_f16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_f16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_f16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_f16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -2225,14 +2222,14 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x4400
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_f16 at abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_f16 at abs32@lo
; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x4400
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_f16 at abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_f16 at abs32@lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -2253,14 +2250,14 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x4400
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_f16 at abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_f16 at abs32@lo
; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_f16 at abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_f16 at abs32@lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
@@ -2282,12 +2279,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -2315,11 +2312,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_f32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_f32 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 4.0
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -2342,12 +2339,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: v_mov_b32_e32 v0, 4.0
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_f32 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_f32 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_f32 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_f32 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -2370,14 +2367,14 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: v_mov_b32_e32 v0, 4.0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_f32 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_f32 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_f32 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_f32 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -2399,12 +2396,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f32 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f32 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f32 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f32 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -2432,12 +2429,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f32 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 1.0
; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -2460,13 +2457,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f32 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -2489,14 +2486,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f32 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -2518,13 +2515,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -2552,13 +2549,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f32 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 1.0
; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
; GFX9-NEXT: v_mov_b32_e32 v2, 4.0
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -2581,14 +2578,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 4.0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f32 at abs32@hi
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f32 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -2611,15 +2608,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_mov_b32_e32 v2, 4.0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f32 at abs32@hi
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f32 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -2641,14 +2638,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f32 at abs32@hi
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f32 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -2676,6 +2673,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5f32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5f32 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 1.0
@@ -2683,8 +2682,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX9-NEXT: v_mov_b32_e32 v2, 4.0
; GFX9-NEXT: v_mov_b32_e32 v3, -1.0
; GFX9-NEXT: v_mov_b32_e32 v4, 0.5
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -2707,16 +2704,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 4.0
; GFX10-NEXT: v_mov_b32_e32 v3, -1.0
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0.5
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5f32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5f32 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -2739,16 +2736,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v4, 0.5
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5f32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5f32 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -2770,16 +2767,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0.5
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5f32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5f32 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -2807,12 +2804,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_f64 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_f64 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -2835,13 +2832,13 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX10-NEXT: s_mov_b32 s35, external_void_func_f64 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_f64 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -2864,14 +2861,14 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000
; GFX11-NEXT: s_mov_b32 s1, external_void_func_f64 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_f64 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -2893,13 +2890,13 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f64 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f64 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -2927,14 +2924,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f64 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f64 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -2957,15 +2954,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f64 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f64 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -2988,15 +2985,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f64 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f64 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -3018,15 +3015,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f64 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f64 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -3054,6 +3051,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f64 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f64 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -3062,8 +3061,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -3086,17 +3083,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f64 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f64 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -3119,16 +3116,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f64 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f64 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -3150,17 +3147,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x40200000
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f64 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f64 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -3186,16 +3183,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_mov_b32_e32 v1, 8
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i8 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i8 at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -3219,16 +3216,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i8 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i8 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v1, 8
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -3252,14 +3249,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_v2i8 at abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_v2i8 at abs32@lo
; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_v2i8 at abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_v2i8 at abs32@lo
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -3284,14 +3281,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_v2i8 at abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_v2i8 at abs32@lo
; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_v2i8 at abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_v2i8 at abs32@lo
+; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -3317,16 +3314,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i8 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i8 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -3355,15 +3352,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i8 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i8 at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3388,15 +3385,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i8 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i8 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3421,14 +3418,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i8 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i8 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i8 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i8 at abs32@lo
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3453,15 +3450,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i8 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i8 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3491,15 +3488,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i8 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i8 at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
@@ -3525,15 +3522,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i8 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i8 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3559,14 +3556,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i8 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i8 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i8 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i8 at abs32@lo
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3592,15 +3589,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i8 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i8 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3631,15 +3628,15 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5i8 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5i8 at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v5
@@ -3667,15 +3664,15 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5i8 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i8 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v5
@@ -3703,14 +3700,14 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5i8 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5i8 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: global_load_b64 v[5:6], v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5i8 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5i8 at abs32@lo
+; GFX11-NEXT: global_load_b64 v[5:6], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v5
@@ -3738,15 +3735,15 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5i8 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5i8 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v5
@@ -3779,15 +3776,15 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i8 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i8 at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3818,15 +3815,15 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i8 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i8 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3857,14 +3854,14 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i8 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i8 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i8 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i8 at abs32@lo
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v8, 8, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3894,15 +3891,15 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i8 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i8 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v8, 8, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3938,18 +3935,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 16
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: global_load_dwordx4 v[16:19], v[4:5], off
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i8 at abs32@hi
+; GFX9-NEXT: global_load_dwordx4 v[16:19], v[4:5], off
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i8 at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0
@@ -4009,18 +4006,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 16
; GFX10-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i8 at abs32@hi
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i8 at abs32@lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: global_load_dwordx4 v[16:19], v[4:5], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_lshrrev_b32_e32 v35, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v0
@@ -4080,16 +4078,17 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_dual_mov_b32 v4, 16 :: v_dual_mov_b32 v5, 0
-; GFX11-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i8 at abs32@hi
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i8 at abs32@lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: global_load_b128 v[16:19], v[4:5], off
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v0
@@ -4146,18 +4145,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 16
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i8 at abs32@hi
-; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i8 at abs32@lo
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[16:19], v[4:5], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v35, 8, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v36, 16, v0
@@ -4223,17 +4223,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v42, s34, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v42, s30, 0
+; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: v_mov_b32_e32 v41, 0
; GFX9-NEXT: global_load_ubyte v0, v[40:41], off
-; GFX9-NEXT: v_writelane_b32 v42, s34, 2
-; GFX9-NEXT: v_writelane_b32 v42, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_ret at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_ret at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: global_store_byte v[40:41], v0, off
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4259,17 +4259,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v42, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v40, 0
; GFX10-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-NEXT: v_writelane_b32 v42, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_ret at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_ret at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_ubyte v0, v[40:41], off
-; GFX10-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: global_store_byte v[40:41], v0, off
; GFX10-NEXT: s_clause 0x1 ; 8-byte Folded Reload
@@ -4295,17 +4295,18 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i8_ret at abs32@hi
; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i8_ret at abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v[40:41], off
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: global_store_b8 v[40:41], v0, off
; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
@@ -4330,17 +4331,18 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i8_ret at abs32@hi
; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i8_ret at abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: global_load_u8 v0, v[40:41], off
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: global_store_b8 v[40:41], v0, off
; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
@@ -4366,17 +4368,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_ret at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_ret at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[40:41], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: global_store_byte v[40:41], v0, off
; GFX10-SCRATCH-NEXT: s_clause 0x1 ; 8-byte Folded Reload
@@ -4409,18 +4411,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v42, s34, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v42, s30, 0
+; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: v_mov_b32_e32 v41, 0
; GFX9-NEXT: global_load_ushort v0, v[40:41], off
-; GFX9-NEXT: v_writelane_b32 v42, s34, 2
; GFX9-NEXT: v_mov_b32_e32 v1, 8
-; GFX9-NEXT: v_writelane_b32 v42, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i8_ret at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i8_ret at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -4450,18 +4452,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v42, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v40, 0
; GFX10-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-NEXT: v_writelane_b32 v42, s34, 2
; GFX10-NEXT: v_mov_b32_e32 v1, 8
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i8_ret at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i8_ret at abs32@lo
; GFX10-NEXT: global_load_ushort v0, v[40:41], off
-; GFX10-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -4491,17 +4493,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_v2i8_ret at abs32@hi
; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_v2i8_ret at abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[40:41], off
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -4534,17 +4537,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_v2i8_ret at abs32@hi
; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_v2i8_ret at abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: global_load_u16 v0, v[40:41], off
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -4577,18 +4581,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i8_ret at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i8_ret at abs32@lo
; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[40:41], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -4625,17 +4629,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v42, s34, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v42, s30, 0
+; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: v_mov_b32_e32 v41, 0
; GFX9-NEXT: global_load_dword v0, v[40:41], off
-; GFX9-NEXT: v_writelane_b32 v42, s34, 2
-; GFX9-NEXT: v_writelane_b32 v42, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -4669,17 +4673,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v42, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v40, 0
; GFX10-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-NEXT: v_writelane_b32 v42, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dword v0, v[40:41], off
-; GFX10-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -4712,17 +4716,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[40:41], off
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -4755,17 +4760,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[40:41], off
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -4799,17 +4805,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dword v0, v[40:41], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -4849,17 +4855,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v42, s34, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v42, s30, 0
+; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: v_mov_b32_e32 v41, 0
; GFX9-NEXT: global_load_dword v0, v[40:41], off
-; GFX9-NEXT: v_writelane_b32 v42, s34, 2
-; GFX9-NEXT: v_writelane_b32 v42, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i8_ret at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i8_ret at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
@@ -4893,17 +4899,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v42, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v40, 0
; GFX10-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-NEXT: v_writelane_b32 v42, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i8_ret at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i8_ret at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dword v0, v[40:41], off
-; GFX10-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -4936,17 +4942,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_v4i8_ret at abs32@hi
; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_v4i8_ret at abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[40:41], off
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -4980,17 +4987,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_v4i8_ret at abs32@hi
; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_v4i8_ret at abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[40:41], off
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -5024,17 +5032,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i8_ret at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i8_ret at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dword v0, v[40:41], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -5074,17 +5082,17 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v42, s34, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v42, s30, 0
+; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: v_mov_b32_e32 v41, 0
; GFX9-NEXT: global_load_dwordx2 v[5:6], v[40:41], off
-; GFX9-NEXT: v_writelane_b32 v42, s34, 2
-; GFX9-NEXT: v_writelane_b32 v42, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5i8_ret at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5i8_ret at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v5
@@ -5123,17 +5131,17 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v42, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v40, 0
; GFX10-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-NEXT: v_writelane_b32 v42, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5i8_ret at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i8_ret at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[40:41], off
-; GFX10-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v5
@@ -5171,17 +5179,18 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_v5i8_ret at abs32@hi
; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_v5i8_ret at abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: global_load_b64 v[5:6], v[40:41], off
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v5
@@ -5219,17 +5228,18 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_v5i8_ret at abs32@hi
; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_v5i8_ret at abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: global_load_b64 v[5:6], v[40:41], off
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v5
@@ -5268,17 +5278,17 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5i8_ret at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5i8_ret at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[5:6], v[40:41], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v5
@@ -5323,17 +5333,17 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v42, s34, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v42, s30, 0
+; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: v_mov_b32_e32 v41, 0
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[40:41], off
-; GFX9-NEXT: v_writelane_b32 v42, s34, 2
-; GFX9-NEXT: v_writelane_b32 v42, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i8_ret@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i8_ret@abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -5375,17 +5385,17 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v42, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v40, 0
; GFX10-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-NEXT: v_writelane_b32 v42, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i8_ret@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i8_ret@abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[40:41], off
-; GFX10-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -5426,17 +5436,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT: ; meta instruction
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi
; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[40:41], off
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 8, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -5478,17 +5489,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT: ; meta instruction
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi
; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[40:41], off
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 8, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -5530,17 +5542,17 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[40:41], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v8, 8, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -5588,22 +5600,22 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v44, s34, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v44, s30, 0
+; GFX9-NEXT: v_writelane_b32 v44, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: v_mov_b32_e32 v41, 0
; GFX9-NEXT: v_mov_b32_e32 v42, 16
; GFX9-NEXT: v_mov_b32_e32 v43, 0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[40:41], off
; GFX9-NEXT: global_load_dwordx4 v[16:19], v[42:43], off
-; GFX9-NEXT: v_writelane_b32 v44, s34, 2
-; GFX9-NEXT: v_writelane_b32 v44, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x800
-; GFX9-NEXT: v_writelane_b32 v44, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0
@@ -5695,22 +5707,23 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v44, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v44, s30, 0
+; GFX10-NEXT: v_writelane_b32 v44, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v40, 0
; GFX10-NEXT: v_mov_b32_e32 v41, 0
; GFX10-NEXT: v_mov_b32_e32 v42, 16
; GFX10-NEXT: v_mov_b32_e32 v43, 0
-; GFX10-NEXT: v_writelane_b32 v44, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[40:41], off
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[40:41], off
; GFX10-NEXT: global_load_dwordx4 v[16:19], v[42:43], off
-; GFX10-NEXT: v_writelane_b32 v44, s30, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x400
-; GFX10-NEXT: v_writelane_b32 v44, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_lshrrev_b32_e32 v35, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v0
@@ -5801,21 +5814,25 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: v_writelane_b32 v44, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:12
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:8
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:4
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v43, s33
+; GFX11-NEXT: v_writelane_b32 v44, s30, 0
+; GFX11-NEXT: v_writelane_b32 v44, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
; GFX11-NEXT: v_dual_mov_b32 v42, 16 :: v_dual_mov_b32 v43, 0
-; GFX11-NEXT: v_writelane_b32 v44, s0, 2
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
-; GFX11-NEXT: global_load_b128 v[0:3], v[40:41], off
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[0:3], v[40:41], off
; GFX11-NEXT: global_load_b128 v[16:19], v[42:43], off
-; GFX11-NEXT: v_writelane_b32 v44, s30, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 32
-; GFX11-NEXT: v_writelane_b32 v44, s31, 1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v0
@@ -5904,22 +5921,23 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v44, s33 offset:16 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:12 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 16
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v43, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
-; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[40:41], off
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[40:41], off
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[16:19], v[42:43], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s30, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v35, 8, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v36, 16, v0
@@ -6018,13 +6036,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i16@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i16@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i16@abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i16@abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -6046,13 +6064,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i16@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i16@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i16@abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i16@abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -6074,15 +6092,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i16@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i16@abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -6103,13 +6121,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i16@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i16@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i16@abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -6136,13 +6154,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16@abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16@abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -6164,13 +6182,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16@abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16@abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -6192,15 +6210,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16@abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -6221,13 +6239,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16@abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -6254,13 +6272,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16@abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16@abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -6282,13 +6300,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16@abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16@abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -6310,15 +6328,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16@abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -6339,13 +6357,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16@abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -6374,12 +6392,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT: v_mov_b32_e32 v1, 3
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -6402,13 +6420,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT: v_mov_b32_e32 v1, 3
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -6431,14 +6449,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -6460,13 +6478,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16@abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -6494,12 +6512,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -6522,13 +6540,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -6551,15 +6569,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -6581,13 +6599,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16@abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -6613,13 +6631,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16@abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16@abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -6641,13 +6659,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -6669,15 +6687,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -6698,13 +6716,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -6733,12 +6751,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -6761,13 +6779,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -6790,15 +6808,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -6820,13 +6838,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -6852,13 +6870,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -6880,13 +6898,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -6908,15 +6926,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -6937,13 +6955,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -6970,13 +6988,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -6998,13 +7016,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -7026,15 +7044,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -7055,13 +7073,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -7090,12 +7108,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -7118,13 +7136,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -7147,14 +7165,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -7176,13 +7194,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -7210,13 +7228,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: v_mov_b32_e32 v1, 4
; GFX9-NEXT: v_mov_b32_e32 v2, 5
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -7239,14 +7257,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 4
; GFX10-NEXT: v_mov_b32_e32 v2, 5
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32 at abs32@hi
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -7269,15 +7287,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT: v_mov_b32_e32 v2, 5
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32 at abs32@hi
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -7299,14 +7317,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32 at abs32@hi
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -7334,14 +7352,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: v_mov_b32_e32 v1, 4
; GFX9-NEXT: v_mov_b32_e32 v2, 5
; GFX9-NEXT: v_mov_b32_e32 v3, 6
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -7364,15 +7382,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 4
; GFX10-NEXT: v_mov_b32_e32 v2, 5
; GFX10-NEXT: v_mov_b32_e32 v3, 6
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -7395,15 +7413,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -7425,15 +7443,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -7459,13 +7477,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -7487,13 +7505,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -7515,15 +7533,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -7544,13 +7562,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -7579,14 +7597,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, 2
; GFX9-NEXT: v_mov_b32_e32 v2, 3
; GFX9-NEXT: v_mov_b32_e32 v3, 4
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -7609,15 +7627,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -7640,15 +7658,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -7670,15 +7688,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -7706,6 +7724,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5i32 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 1
@@ -7713,8 +7733,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX9-NEXT: v_mov_b32_e32 v2, 3
; GFX9-NEXT: v_mov_b32_e32 v3, 4
; GFX9-NEXT: v_mov_b32_e32 v4, 5
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -7737,16 +7755,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 5
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i32 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -7769,16 +7787,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v4, 5
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5i32 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -7800,16 +7818,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -7836,16 +7854,16 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35]
; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[34:35] offset:16
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32 at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -7868,17 +7886,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35]
; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[34:35] offset:16
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -7901,19 +7920,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1]
; GFX11-NEXT: global_load_b128 v[4:7], v4, s[0:1] offset:16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -7935,17 +7954,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -7975,6 +7995,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 1
@@ -7985,8 +8007,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX9-NEXT: v_mov_b32_e32 v5, 6
; GFX9-NEXT: v_mov_b32_e32 v6, 7
; GFX9-NEXT: v_mov_b32_e32 v7, 8
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -8009,19 +8029,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 5
; GFX10-NEXT: v_mov_b32_e32 v5, 6
; GFX10-NEXT: v_mov_b32_e32 v6, 7
; GFX10-NEXT: v_mov_b32_e32 v7, 8
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -8044,17 +8064,17 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6
; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -8076,19 +8096,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 6
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 7
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 8
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -8115,10 +8135,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v16, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v16, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[34:35]
; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[34:35] offset:16
@@ -8126,7 +8147,6 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[34:35] offset:48
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16i32 at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -8149,19 +8169,20 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v16, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[34:35]
; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[34:35] offset:16
; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[34:35] offset:32
; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[34:35] offset:48
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -8184,21 +8205,21 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v12, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_b128 v[0:3], v12, s[0:1]
; GFX11-NEXT: global_load_b128 v[4:7], v12, s[0:1] offset:16
; GFX11-NEXT: global_load_b128 v[8:11], v12, s[0:1] offset:32
; GFX11-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:48
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16i32 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -8220,19 +8241,20 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x3
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -8261,10 +8283,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v28, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v28, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[34:35]
; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[34:35] offset:16
@@ -8277,7 +8300,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[34:35] offset:112
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32 at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -8300,9 +8322,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[34:35]
@@ -8313,10 +8337,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -8339,9 +8362,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v28, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_load_b128 v[0:3], v28, s[0:1]
@@ -8352,12 +8377,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80
; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96
; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -8379,9 +8402,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x7
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
@@ -8392,10 +8417,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -8424,10 +8448,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v28, 0
; GFX9-NEXT: global_load_dword v32, v[0:1], off
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[34:35]
; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[34:35] offset:16
@@ -8438,10 +8464,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[34:35] offset:96
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[34:35] offset:112
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32 at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(8)
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -8466,9 +8490,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dword v33, v[0:1], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x7
@@ -8480,10 +8506,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(8)
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -8508,9 +8533,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v28, 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: global_load_b32 v32, v[0:1], off
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x7
@@ -8522,10 +8549,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80
; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96
; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: scratch_store_b32 off, v32, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -8550,9 +8575,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dword v33, v[0:1], off
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x7
@@ -8564,10 +8591,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8)
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -8599,15 +8625,15 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v42, s34, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v42, s30, 0
+; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v40, v0
; GFX9-NEXT: s_mov_b32 s35, external_i32_func_i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_i32_func_i32 at abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v41, v1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: global_store_dword v[40:41], v0, off
@@ -8635,16 +8661,16 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v42, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v40, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: s_mov_b32 s35, external_i32_func_i32 at abs32@hi
-; GFX10-NEXT: v_writelane_b32 v42, s30, 0
; GFX10-NEXT: s_mov_b32 s34, external_i32_func_i32 at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_mov_b32_e32 v41, v1
-; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: global_store_dword v[40:41], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -8672,16 +8698,18 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_clause 0x1 ; 8-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
-; GFX11-NEXT: v_writelane_b32 v42, s30, 0
; GFX11-NEXT: s_mov_b32 s1, external_i32_func_i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_i32_func_i32 at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v42, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: global_store_b32 v[40:41], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -8709,16 +8737,16 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, v0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_i32_func_i32 at abs32@hi
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_i32_func_i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
@@ -8751,16 +8779,16 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v2, s[34:35]
; GFX9-NEXT: global_load_dword v1, v2, s[34:35] offset:4
; GFX9-NEXT: s_mov_b32 s35, external_void_func_struct_i8_i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_struct_i8_i32 at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -8783,17 +8811,18 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ubyte v0, v2, s[34:35]
; GFX10-NEXT: global_load_dword v1, v2, s[34:35] offset:4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 s35, external_void_func_struct_i8_i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_struct_i8_i32 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -8816,19 +8845,19 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[0:1] offset:4
; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_struct_i8_i32 at abs32@hi
; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_struct_i8_i32 at abs32@lo
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -8849,19 +8878,19 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u8 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: global_load_b32 v1, v1, s[0:1] offset:4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_struct_i8_i32 at abs32@hi
; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_struct_i8_i32 at abs32@lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
@@ -8883,17 +8912,18 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1]
; GFX10-SCRATCH-NEXT: global_load_dword v1, v2, s[0:1] offset:4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_struct_i8_i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_struct_i8_i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -8921,17 +8951,17 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33
; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX9-NEXT: s_mov_b32 s35, external_void_func_byval_struct_i8_i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_byval_struct_i8_i32 at abs32@lo
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -8954,16 +8984,16 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 8
; GFX10-NEXT: s_mov_b32 s35, external_void_func_byval_struct_i8_i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_byval_struct_i8_i32 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -8986,19 +9016,18 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 3
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 8
; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_byval_struct_i8_i32 at abs32@hi
; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_byval_struct_i8_i32 at abs32@lo
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b8 off, v0, s33
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v1, s33 offset:4
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s33
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -9019,18 +9048,17 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_byval_struct_i8_i32 at abs32@hi
; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_byval_struct_i8_i32 at abs32@lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: scratch_store_b8 off, v0, s33
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v1, s33 offset:4
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s33
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
@@ -9052,16 +9080,16 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_byval_struct_i8_i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_byval_struct_i8_i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -9092,19 +9120,19 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x800
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33
; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@lo
; GFX9-NEXT: v_add_u32_e32 v0, 8, v0
-; GFX9-NEXT: v_lshrrev_b32_e64 v1, 6, s33
-; GFX9-NEXT: s_addk_i32 s32, 0x800
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: v_lshrrev_b32_e64 v1, 6, s33
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12
@@ -9133,19 +9161,19 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x400
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 8
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@lo
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33
-; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8
@@ -9177,17 +9205,17 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 32
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 3
; GFX11-TRUE16-NEXT: s_add_i32 s2, s33, 8
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@lo
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b8 off, v0, s33
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v1, s33 offset:4
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s33
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@lo
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, off, s33 offset:8
@@ -9217,17 +9245,17 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 32
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s33, 8
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s33, 8
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@lo
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: scratch_store_b8 off, v0, s33
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v1, s33 offset:4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s33
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: scratch_load_u8 v0, off, s33 offset:8
@@ -9258,18 +9286,18 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: s_add_i32 s2, s33, 8
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@lo
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s33
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: scratch_load_ubyte v0, off, s33 offset:8
@@ -9317,11 +9345,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35]
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16i8 at abs32@hi
@@ -9367,11 +9395,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35]
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
@@ -9418,11 +9446,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1]
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16i8 at abs32@hi
@@ -9466,11 +9494,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
@@ -9522,9 +9550,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33
+; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v40, s34, 0
; GFX9-NEXT: v_writelane_b32 v40, s35, 1
; GFX9-NEXT: v_writelane_b32 v40, s36, 2
@@ -9539,11 +9565,13 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s53, 11
; GFX9-NEXT: v_writelane_b32 v40, s54, 12
; GFX9-NEXT: v_writelane_b32 v40, s55, 13
-; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v40, s30, 14
+; GFX9-NEXT: v_writelane_b32 v40, s31, 15
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33
; GFX9-NEXT: s_mov_b32 s5, byval_align16_f64_arg at abs32@hi
; GFX9-NEXT: s_mov_b32 s4, byval_align16_f64_arg at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s31, 15
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT: s_waitcnt vmcnt(2)
@@ -9582,18 +9610,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
-; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33
-; GFX10-NEXT: v_writelane_b32 v40, s34, 0
; GFX10-NEXT: s_addk_i32 s32, 0x400
-; GFX10-NEXT: s_mov_b32 s5, byval_align16_f64_arg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s4, byval_align16_f64_arg at abs32@lo
-; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 0
; GFX10-NEXT: v_writelane_b32 v40, s35, 1
; GFX10-NEXT: v_writelane_b32 v40, s36, 2
; GFX10-NEXT: v_writelane_b32 v40, s37, 3
@@ -9609,6 +9627,16 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s55, 13
; GFX10-NEXT: v_writelane_b32 v40, s30, 14
; GFX10-NEXT: v_writelane_b32 v40, s31, 15
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33
+; GFX10-NEXT: s_mov_b32 s5, byval_align16_f64_arg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s4, byval_align16_f64_arg at abs32@lo
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_readlane_b32 s30, v40, 14
; GFX10-NEXT: v_readlane_b32 s31, v40, 15
@@ -9643,13 +9671,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16
-; GFX11-NEXT: scratch_load_b32 v31, off, s33
-; GFX11-NEXT: v_writelane_b32 v40, s34, 0
; GFX11-NEXT: s_add_i32 s32, s32, 32
-; GFX11-NEXT: s_mov_b32 s1, byval_align16_f64_arg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, byval_align16_f64_arg at abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s34, 0
; GFX11-NEXT: v_writelane_b32 v40, s35, 1
; GFX11-NEXT: v_writelane_b32 v40, s36, 2
; GFX11-NEXT: v_writelane_b32 v40, s37, 3
@@ -9665,6 +9688,11 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s55, 13
; GFX11-NEXT: v_writelane_b32 v40, s30, 14
; GFX11-NEXT: v_writelane_b32 v40, s31, 15
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16
+; GFX11-NEXT: scratch_load_b32 v31, off, s33
+; GFX11-NEXT: s_mov_b32 s1, byval_align16_f64_arg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, byval_align16_f64_arg at abs32@lo
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -9701,13 +9729,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: s_clause 0x1
-; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16
-; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, byval_align16_f64_arg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, byval_align16_f64_arg at abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s35, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 3
@@ -9723,6 +9746,11 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 13
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 14
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 15
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16
+; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, byval_align16_f64_arg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, byval_align16_f64_arg at abs32@lo
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1)
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -9767,12 +9795,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i1_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i1_inreg at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -9796,13 +9824,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: s_mov_b32 s35, external_void_func_i1_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_i1_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -9825,15 +9853,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: v_mov_b32_e32 v0, 1
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_mov_b32_e32 v0, 1
; GFX11-NEXT: s_mov_b32 s1, external_void_func_i1_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_i1_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -9855,13 +9882,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i1_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i1_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -9888,13 +9915,13 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_inreg at abs32@lo
; GFX9-NEXT: s_movk_i32 s4, 0x7b
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
@@ -9918,13 +9945,13 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_inreg at abs32@lo
+; GFX10-NEXT: s_movk_i32 s4, 0x7b
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
@@ -9948,15 +9975,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8_inreg at abs32@lo
+; GFX11-NEXT: s_movk_i32 s4, 0x7b
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
@@ -9979,13 +10005,13 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
@@ -10013,13 +10039,13 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i16_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i16_inreg at abs32@lo
; GFX9-NEXT: s_movk_i32 s4, 0x7b
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
@@ -10043,13 +10069,13 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16_inreg at abs32@lo
+; GFX10-NEXT: s_movk_i32 s4, 0x7b
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
@@ -10073,15 +10099,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i16_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i16_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i16_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i16_inreg at abs32@lo
+; GFX11-NEXT: s_movk_i32 s4, 0x7b
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
@@ -10104,13 +10129,13 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
@@ -10138,13 +10163,13 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 42
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
@@ -10168,13 +10193,13 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i32_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i32_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 42
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i32_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i32_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 42
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
@@ -10198,15 +10223,14 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i32_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i32_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 42
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i32_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i32_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 42
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
@@ -10229,13 +10253,13 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
@@ -10263,15 +10287,15 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i64_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i64_inreg at abs32@lo
; GFX9-NEXT: s_movk_i32 s4, 0x7b
; GFX9-NEXT: s_mov_b32 s5, 0
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
@@ -10296,15 +10320,15 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 4
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_i64_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_i64_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_i64_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_i64_inreg at abs32@lo
+; GFX10-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
@@ -10329,17 +10353,16 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i64_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i64_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i64_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i64_inreg at abs32@lo
+; GFX11-NEXT: s_movk_i32 s4, 0x7b
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -10363,15 +10386,15 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i64_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
@@ -10400,17 +10423,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 6
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: v_writelane_b32 v40, s31, 5
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
; GFX9-NEXT: v_readlane_b32 s31, v40, 5
@@ -10437,17 +10460,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 6
-; GFX10-NEXT: s_mov_b64 s[34:35], 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
+; GFX10-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: s_mov_b64 s[34:35], 0
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-NEXT: v_readlane_b32 s31, v40, 5
@@ -10474,19 +10497,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 6
-; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
+; GFX11-NEXT: v_writelane_b32 v40, s30, 4
+; GFX11-NEXT: v_writelane_b32 v40, s31, 5
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 4
-; GFX11-NEXT: v_writelane_b32 v40, s31, 5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
@@ -10512,17 +10535,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
@@ -10554,19 +10577,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 6
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 1
; GFX9-NEXT: s_mov_b32 s5, 2
; GFX9-NEXT: s_mov_b32 s6, 3
; GFX9-NEXT: s_mov_b32 s7, 4
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
; GFX9-NEXT: v_readlane_b32 s31, v40, 5
@@ -10593,19 +10616,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 6
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 4
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
+; GFX10-NEXT: s_mov_b32 s6, 3
+; GFX10-NEXT: s_mov_b32 s7, 4
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-NEXT: v_readlane_b32 s31, v40, 5
@@ -10632,21 +10655,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 6
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 4
; GFX11-NEXT: v_writelane_b32 v40, s30, 4
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
+; GFX11-NEXT: s_mov_b32 s6, 3
+; GFX11-NEXT: s_mov_b32 s7, 4
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
@@ -10672,19 +10694,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
@@ -10715,21 +10737,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 8
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
; GFX9-NEXT: v_writelane_b32 v40, s30, 6
+; GFX9-NEXT: v_writelane_b32 v40, s31, 7
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s8, 1
; GFX9-NEXT: s_mov_b32 s9, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 7
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 6
; GFX9-NEXT: v_readlane_b32 s31, v40, 7
@@ -10758,21 +10780,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 8
-; GFX10-NEXT: s_mov_b64 s[34:35], 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg at abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: s_mov_b32 s8, 1
; GFX10-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-NEXT: s_mov_b32 s9, 2
; GFX10-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-NEXT: v_writelane_b32 v40, s31, 7
+; GFX10-NEXT: s_mov_b64 s[34:35], 0
+; GFX10-NEXT: s_mov_b32 s8, 1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s9, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 6
; GFX10-NEXT: v_readlane_b32 s31, v40, 7
@@ -10801,23 +10823,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 8
-; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg at abs32@lo
; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 1
; GFX11-NEXT: v_writelane_b32 v40, s9, 5
-; GFX11-NEXT: s_mov_b32 s9, 2
; GFX11-NEXT: v_writelane_b32 v40, s30, 6
; GFX11-NEXT: v_writelane_b32 v40, s31, 7
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: s_mov_b32 s8, 1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s9, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 6
; GFX11-NEXT: v_readlane_b32 s31, v40, 7
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
@@ -10845,21 +10866,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8
-; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7
+; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1
+; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
@@ -10895,25 +10916,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 10
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
; GFX9-NEXT: v_writelane_b32 v40, s10, 6
; GFX9-NEXT: v_writelane_b32 v40, s11, 7
; GFX9-NEXT: v_writelane_b32 v40, s30, 8
+; GFX9-NEXT: v_writelane_b32 v40, s31, 9
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s8, 1
; GFX9-NEXT: s_mov_b32 s9, 2
; GFX9-NEXT: s_mov_b32 s10, 3
; GFX9-NEXT: s_mov_b32 s11, 4
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 9
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 8
; GFX9-NEXT: v_readlane_b32 s31, v40, 9
@@ -10944,25 +10965,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 10
-; GFX10-NEXT: s_mov_b64 s[34:35], 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg at abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: s_mov_b32 s8, 1
; GFX10-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-NEXT: s_mov_b32 s9, 2
; GFX10-NEXT: v_writelane_b32 v40, s10, 6
-; GFX10-NEXT: s_mov_b32 s10, 3
; GFX10-NEXT: v_writelane_b32 v40, s11, 7
-; GFX10-NEXT: s_mov_b32 s11, 4
; GFX10-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-NEXT: v_writelane_b32 v40, s31, 9
+; GFX10-NEXT: s_mov_b64 s[34:35], 0
+; GFX10-NEXT: s_mov_b32 s8, 1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s9, 2
+; GFX10-NEXT: s_mov_b32 s10, 3
+; GFX10-NEXT: s_mov_b32 s11, 4
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 8
; GFX10-NEXT: v_readlane_b32 s31, v40, 9
@@ -10993,27 +11014,26 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 10
-; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg at abs32@lo
; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 1
; GFX11-NEXT: v_writelane_b32 v40, s9, 5
-; GFX11-NEXT: s_mov_b32 s9, 2
; GFX11-NEXT: v_writelane_b32 v40, s10, 6
-; GFX11-NEXT: s_mov_b32 s10, 3
; GFX11-NEXT: v_writelane_b32 v40, s11, 7
-; GFX11-NEXT: s_mov_b32 s11, 4
; GFX11-NEXT: v_writelane_b32 v40, s30, 8
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: s_mov_b32 s8, 1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s9, 2
+; GFX11-NEXT: s_mov_b32 s10, 3
+; GFX11-NEXT: s_mov_b32 s11, 4
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 8
; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s11, v40, 7
@@ -11043,25 +11063,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10
-; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7
-; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
+; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1
+; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 4
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
@@ -11098,13 +11118,13 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_mov_b32 s35, external_void_func_f16_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_f16_inreg at abs32@lo
; GFX9-NEXT: s_movk_i32 s4, 0x4400
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
@@ -11128,13 +11148,13 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_f16_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_f16_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x4400
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_f16_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_f16_inreg at abs32@lo
+; GFX10-NEXT: s_movk_i32 s4, 0x4400
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
@@ -11158,15 +11178,14 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_f16_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_f16_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x4400
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_f16_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_f16_inreg at abs32@lo
+; GFX11-NEXT: s_movk_i32 s4, 0x4400
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
@@ -11189,13 +11208,13 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f16_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
@@ -11223,13 +11242,13 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_mov_b32 s35, external_void_func_f32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_f32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 4.0
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
@@ -11253,13 +11272,13 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_f32_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_f32_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 4.0
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_f32_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_f32_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 4.0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
@@ -11283,15 +11302,14 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_f32_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_f32_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 4.0
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_f32_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_f32_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 4.0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
@@ -11314,13 +11332,13 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
@@ -11348,15 +11366,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 1.0
; GFX9-NEXT: s_mov_b32 s5, 2.0
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
@@ -11381,15 +11399,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 4
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f32_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f32_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1.0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f32_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f32_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 1.0
+; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
@@ -11414,17 +11432,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 1.0
+; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -11448,15 +11465,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
@@ -11485,17 +11502,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 5
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 3
+; GFX9-NEXT: v_writelane_b32 v40, s31, 4
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 1.0
; GFX9-NEXT: s_mov_b32 s5, 2.0
; GFX9-NEXT: s_mov_b32 s6, 4.0
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 3
; GFX9-NEXT: v_readlane_b32 s31, v40, 4
@@ -11521,17 +11538,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 5
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1.0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 4.0
; GFX10-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-NEXT: v_writelane_b32 v40, s31, 4
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 1.0
+; GFX10-NEXT: s_mov_b32 s5, 2.0
+; GFX10-NEXT: s_mov_b32 s6, 4.0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 3
; GFX10-NEXT: v_readlane_b32 s31, v40, 4
@@ -11557,19 +11574,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 5
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 4.0
; GFX11-NEXT: v_writelane_b32 v40, s30, 3
; GFX11-NEXT: v_writelane_b32 v40, s31, 4
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 1.0
+; GFX11-NEXT: s_mov_b32 s5, 2.0
+; GFX11-NEXT: s_mov_b32 s6, 4.0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 3
; GFX11-NEXT: v_readlane_b32 s31, v40, 4
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
@@ -11594,17 +11610,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
@@ -11634,12 +11650,14 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 7
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
; GFX9-NEXT: v_writelane_b32 v40, s30, 5
+; GFX9-NEXT: v_writelane_b32 v40, s31, 6
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 1.0
@@ -11647,8 +11665,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX9-NEXT: s_mov_b32 s6, 4.0
; GFX9-NEXT: s_mov_b32 s7, -1.0
; GFX9-NEXT: s_mov_b32 s8, 0.5
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 6
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 5
; GFX9-NEXT: v_readlane_b32 s31, v40, 6
@@ -11676,21 +11692,21 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 7
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: v_writelane_b32 v40, s6, 2
+; GFX10-NEXT: v_writelane_b32 v40, s7, 3
+; GFX10-NEXT: v_writelane_b32 v40, s8, 4
+; GFX10-NEXT: v_writelane_b32 v40, s30, 5
+; GFX10-NEXT: v_writelane_b32 v40, s31, 6
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg at abs32@lo
; GFX10-NEXT: s_mov_b32 s4, 1.0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_mov_b32 s5, 2.0
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 4.0
-; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: s_mov_b32 s7, -1.0
-; GFX10-NEXT: v_writelane_b32 v40, s8, 4
; GFX10-NEXT: s_mov_b32 s8, 0.5
-; GFX10-NEXT: v_writelane_b32 v40, s30, 5
-; GFX10-NEXT: v_writelane_b32 v40, s31, 6
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 5
; GFX10-NEXT: v_readlane_b32 s31, v40, 6
@@ -11718,23 +11734,22 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 7
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5f32_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 4.0
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, -1.0
; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 0.5
; GFX11-NEXT: v_writelane_b32 v40, s30, 5
; GFX11-NEXT: v_writelane_b32 v40, s31, 6
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5f32_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 1.0
+; GFX11-NEXT: s_mov_b32 s5, 2.0
+; GFX11-NEXT: s_mov_b32 s6, 4.0
+; GFX11-NEXT: s_mov_b32 s7, -1.0
+; GFX11-NEXT: s_mov_b32 s8, 0.5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 5
; GFX11-NEXT: v_readlane_b32 s31, v40, 6
; GFX11-NEXT: v_readlane_b32 s8, v40, 4
@@ -11761,21 +11776,21 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5f32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, -1.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0.5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5f32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s7, -1.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0.5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
@@ -11807,15 +11822,15 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_mov_b32 s35, external_void_func_f64_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_f64_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s5, 0x40100000
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
@@ -11840,15 +11855,15 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 4
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_f64_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_f64_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_f64_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_f64_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
@@ -11873,17 +11888,16 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_f64_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_f64_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0x40100000
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_f64_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_f64_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b32 s5, 0x40100000
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -11907,15 +11921,15 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f64_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
@@ -11944,19 +11958,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 6
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s5, 2.0
; GFX9-NEXT: s_mov_b32 s6, 0
; GFX9-NEXT: s_mov_b32 s7, 0x40100000
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
; GFX9-NEXT: v_readlane_b32 s31, v40, 5
@@ -11983,19 +11997,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 6
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 0x40100000
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s5, 2.0
+; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: s_mov_b32 s7, 0x40100000
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-NEXT: v_readlane_b32 s31, v40, 5
@@ -12022,21 +12036,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 6
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 0
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 0x40100000
; GFX11-NEXT: v_writelane_b32 v40, s30, 4
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b32 s5, 2.0
+; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_mov_b32 s7, 0x40100000
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
@@ -12062,19 +12075,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
@@ -12105,6 +12118,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 8
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -12112,6 +12126,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
; GFX9-NEXT: v_writelane_b32 v40, s30, 6
+; GFX9-NEXT: v_writelane_b32 v40, s31, 7
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 0
@@ -12120,8 +12135,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX9-NEXT: s_mov_b32 s7, 0x40100000
; GFX9-NEXT: s_mov_b32 s8, 0
; GFX9-NEXT: s_mov_b32 s9, 0x40200000
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 7
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 6
; GFX9-NEXT: v_readlane_b32 s31, v40, 7
@@ -12150,23 +12163,23 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 8
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 0x40100000
; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-NEXT: s_mov_b32 s9, 0x40200000
; GFX10-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-NEXT: v_writelane_b32 v40, s31, 7
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s5, 2.0
+; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: s_mov_b32 s7, 0x40100000
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_mov_b32 s9, 0x40200000
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 6
; GFX10-NEXT: v_readlane_b32 s31, v40, 7
@@ -12195,25 +12208,24 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 8
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 0
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 0x40100000
; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 0
; GFX11-NEXT: v_writelane_b32 v40, s9, 5
-; GFX11-NEXT: s_mov_b32 s9, 0x40200000
; GFX11-NEXT: v_writelane_b32 v40, s30, 6
; GFX11-NEXT: v_writelane_b32 v40, s31, 7
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b32 s5, 2.0
+; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_mov_b32 s7, 0x40100000
+; GFX11-NEXT: s_mov_b32 s8, 0
+; GFX11-NEXT: s_mov_b32 s9, 0x40200000
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 6
; GFX11-NEXT: v_readlane_b32 s31, v40, 7
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
@@ -12241,23 +12253,23 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 0x40200000
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000
+; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 0x40200000
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
@@ -12290,13 +12302,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
+; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i16_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i16_inreg at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
@@ -12322,11 +12334,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i16_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i16_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
@@ -12352,13 +12364,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
+; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i16_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i16_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
@@ -12383,11 +12395,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i16_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i16_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
@@ -12416,14 +12428,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
@@ -12451,11 +12463,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
@@ -12483,13 +12495,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: v_writelane_b32 v40, s30, 2
+; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -12516,11 +12528,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
@@ -12550,14 +12562,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
@@ -12585,11 +12597,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
@@ -12617,13 +12629,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: v_writelane_b32 v40, s30, 2
+; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -12650,11 +12662,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
@@ -12684,15 +12696,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 0x20001
; GFX9-NEXT: s_mov_b32 s5, 3
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
@@ -12717,15 +12729,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 4
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x20001
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 3
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-NEXT: s_mov_b32 s5, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
@@ -12750,17 +12762,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0x20001
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 3
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 0x20001
+; GFX11-NEXT: s_mov_b32 s5, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -12784,15 +12795,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
@@ -12821,15 +12832,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 0x40003c00
; GFX9-NEXT: s_movk_i32 s5, 0x4400
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
@@ -12854,15 +12865,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 4
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x40003c00
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_movk_i32 s5, 0x4400
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 0x40003c00
+; GFX10-NEXT: s_movk_i32 s5, 0x4400
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
@@ -12887,17 +12898,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 4
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: v_writelane_b32 v40, s30, 2
+; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b32 s4, 0x40003c00
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: s_movk_i32 s5, 0x4400
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -12921,15 +12931,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00
+; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
@@ -12958,14 +12968,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
@@ -12993,11 +13003,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
@@ -13025,13 +13035,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: v_writelane_b32 v40, s30, 2
+; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -13058,11 +13068,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
@@ -13092,15 +13102,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 0x20001
; GFX9-NEXT: s_mov_b32 s5, 0x40003
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
@@ -13125,15 +13135,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 4
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x20001
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0x40003
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-NEXT: s_mov_b32 s5, 0x40003
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
@@ -13158,17 +13168,16 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0x20001
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0x40003
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 0x20001
+; GFX11-NEXT: s_mov_b32 s5, 0x40003
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -13192,15 +13201,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
@@ -13229,13 +13238,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
+; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f16_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f16_inreg at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
@@ -13261,11 +13270,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f16_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f16_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
@@ -13291,13 +13300,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
+; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f16_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f16_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
@@ -13322,11 +13331,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f16_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f16_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
@@ -13355,14 +13364,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
@@ -13390,11 +13399,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
@@ -13422,13 +13431,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: v_writelane_b32 v40, s30, 2
+; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -13455,11 +13464,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
@@ -13489,15 +13498,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 1
; GFX9-NEXT: s_mov_b32 s5, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
@@ -13522,15 +13531,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 4
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
@@ -13555,17 +13564,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -13589,15 +13597,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
@@ -13626,17 +13634,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 5
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 3
+; GFX9-NEXT: v_writelane_b32 v40, s31, 4
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 3
; GFX9-NEXT: s_mov_b32 s5, 4
; GFX9-NEXT: s_mov_b32 s6, 5
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 3
; GFX9-NEXT: v_readlane_b32 s31, v40, 4
@@ -13662,17 +13670,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 5
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 3
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 4
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 5
; GFX10-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-NEXT: v_writelane_b32 v40, s31, 4
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 3
+; GFX10-NEXT: s_mov_b32 s5, 4
+; GFX10-NEXT: s_mov_b32 s6, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 3
; GFX10-NEXT: v_readlane_b32 s31, v40, 4
@@ -13698,19 +13706,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 5
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 3
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 4
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 5
; GFX11-NEXT: v_writelane_b32 v40, s30, 3
; GFX11-NEXT: v_writelane_b32 v40, s31, 4
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 3
+; GFX11-NEXT: s_mov_b32 s5, 4
+; GFX11-NEXT: s_mov_b32 s6, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 3
; GFX11-NEXT: v_readlane_b32 s31, v40, 4
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
@@ -13735,17 +13742,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
+; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
@@ -13775,19 +13782,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 6
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 3
; GFX9-NEXT: s_mov_b32 s5, 4
; GFX9-NEXT: s_mov_b32 s6, 5
; GFX9-NEXT: s_mov_b32 s7, 6
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
; GFX9-NEXT: v_readlane_b32 s31, v40, 5
@@ -13814,19 +13821,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 6
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 3
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 4
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 5
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 6
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 3
+; GFX10-NEXT: s_mov_b32 s5, 4
+; GFX10-NEXT: s_mov_b32 s6, 5
+; GFX10-NEXT: s_mov_b32 s7, 6
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-NEXT: v_readlane_b32 s31, v40, 5
@@ -13853,21 +13860,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 6
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 3
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 4
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 5
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 6
; GFX11-NEXT: v_writelane_b32 v40, s30, 4
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 3
+; GFX11-NEXT: s_mov_b32 s5, 4
+; GFX11-NEXT: s_mov_b32 s6, 5
+; GFX11-NEXT: s_mov_b32 s7, 6
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
@@ -13893,19 +13899,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 6
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
+; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5
+; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 6
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
@@ -13936,16 +13942,16 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 6
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: v_writelane_b32 v40, s31, 5
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
; GFX9-NEXT: v_readlane_b32 s31, v40, 5
@@ -13977,11 +13983,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
+; GFX10-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-NEXT: v_readlane_b32 s31, v40, 5
@@ -14013,13 +14019,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
+; GFX11-NEXT: v_writelane_b32 v40, s30, 4
+; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 4
-; GFX11-NEXT: v_writelane_b32 v40, s31, 5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
@@ -14050,11 +14056,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
@@ -14086,19 +14092,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 6
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 1
; GFX9-NEXT: s_mov_b32 s5, 2
; GFX9-NEXT: s_mov_b32 s6, 3
; GFX9-NEXT: s_mov_b32 s7, 4
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
; GFX9-NEXT: v_readlane_b32 s31, v40, 5
@@ -14125,19 +14131,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 6
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 4
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
+; GFX10-NEXT: s_mov_b32 s6, 3
+; GFX10-NEXT: s_mov_b32 s7, 4
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-NEXT: v_readlane_b32 s31, v40, 5
@@ -14164,21 +14170,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 6
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 4
; GFX11-NEXT: v_writelane_b32 v40, s30, 4
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
+; GFX11-NEXT: s_mov_b32 s6, 3
+; GFX11-NEXT: s_mov_b32 s7, 4
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
@@ -14204,19 +14209,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
@@ -14247,12 +14252,14 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 7
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
; GFX9-NEXT: v_writelane_b32 v40, s30, 5
+; GFX9-NEXT: v_writelane_b32 v40, s31, 6
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 1
@@ -14260,8 +14267,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX9-NEXT: s_mov_b32 s6, 3
; GFX9-NEXT: s_mov_b32 s7, 4
; GFX9-NEXT: s_mov_b32 s8, 5
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 6
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 5
; GFX9-NEXT: v_readlane_b32 s31, v40, 6
@@ -14289,21 +14294,21 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 7
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 4
; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: s_mov_b32 s8, 5
; GFX10-NEXT: v_writelane_b32 v40, s30, 5
; GFX10-NEXT: v_writelane_b32 v40, s31, 6
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
+; GFX10-NEXT: s_mov_b32 s6, 3
+; GFX10-NEXT: s_mov_b32 s7, 4
+; GFX10-NEXT: s_mov_b32 s8, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 5
; GFX10-NEXT: v_readlane_b32 s31, v40, 6
@@ -14331,23 +14336,22 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 7
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5i32_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 4
; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 5
; GFX11-NEXT: v_writelane_b32 v40, s30, 5
; GFX11-NEXT: v_writelane_b32 v40, s31, 6
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5i32_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
+; GFX11-NEXT: s_mov_b32 s6, 3
+; GFX11-NEXT: s_mov_b32 s7, 4
+; GFX11-NEXT: s_mov_b32 s8, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 5
; GFX11-NEXT: v_readlane_b32 s31, v40, 6
; GFX11-NEXT: v_readlane_b32 s8, v40, 4
@@ -14374,21 +14378,21 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5i32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
+; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
@@ -14420,22 +14424,22 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 10
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
; GFX9-NEXT: v_writelane_b32 v40, s10, 6
; GFX9-NEXT: v_writelane_b32 v40, s11, 7
+; GFX9-NEXT: v_writelane_b32 v40, s30, 8
+; GFX9-NEXT: v_writelane_b32 v40, s31, 9
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 8
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 9
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 8
; GFX9-NEXT: v_readlane_b32 s31, v40, 9
@@ -14466,7 +14470,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 10
-; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
@@ -14476,12 +14479,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s9, 5
; GFX10-NEXT: v_writelane_b32 v40, s10, 6
; GFX10-NEXT: v_writelane_b32 v40, s11, 7
+; GFX10-NEXT: v_writelane_b32 v40, s30, 8
+; GFX10-NEXT: v_writelane_b32 v40, s31, 9
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 8
-; GFX10-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 8
; GFX10-NEXT: v_readlane_b32 s31, v40, 9
@@ -14512,7 +14516,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 10
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
@@ -14522,14 +14525,15 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s9, 5
; GFX11-NEXT: v_writelane_b32 v40, s10, 6
; GFX11-NEXT: v_writelane_b32 v40, s11, 7
+; GFX11-NEXT: v_writelane_b32 v40, s30, 8
+; GFX11-NEXT: v_writelane_b32 v40, s31, 9
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 8
-; GFX11-NEXT: v_writelane_b32 v40, s31, 9
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 8
; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s11, v40, 7
@@ -14559,7 +14563,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10
-; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
@@ -14569,12 +14572,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
+; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
@@ -14611,6 +14615,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 10
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -14620,6 +14625,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s10, 6
; GFX9-NEXT: v_writelane_b32 v40, s11, 7
; GFX9-NEXT: v_writelane_b32 v40, s30, 8
+; GFX9-NEXT: v_writelane_b32 v40, s31, 9
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s4, 1
@@ -14630,8 +14636,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX9-NEXT: s_mov_b32 s9, 6
; GFX9-NEXT: s_mov_b32 s10, 7
; GFX9-NEXT: s_mov_b32 s11, 8
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 9
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 8
; GFX9-NEXT: v_readlane_b32 s31, v40, 9
@@ -14662,27 +14666,27 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 10
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 4
; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: s_mov_b32 s8, 5
; GFX10-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-NEXT: s_mov_b32 s9, 6
; GFX10-NEXT: v_writelane_b32 v40, s10, 6
-; GFX10-NEXT: s_mov_b32 s10, 7
; GFX10-NEXT: v_writelane_b32 v40, s11, 7
-; GFX10-NEXT: s_mov_b32 s11, 8
; GFX10-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-NEXT: v_writelane_b32 v40, s31, 9
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
+; GFX10-NEXT: s_mov_b32 s6, 3
+; GFX10-NEXT: s_mov_b32 s7, 4
+; GFX10-NEXT: s_mov_b32 s8, 5
+; GFX10-NEXT: s_mov_b32 s9, 6
+; GFX10-NEXT: s_mov_b32 s10, 7
+; GFX10-NEXT: s_mov_b32 s11, 8
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 8
; GFX10-NEXT: v_readlane_b32 s31, v40, 9
@@ -14713,29 +14717,28 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 10
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 4
; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 5
; GFX11-NEXT: v_writelane_b32 v40, s9, 5
-; GFX11-NEXT: s_mov_b32 s9, 6
; GFX11-NEXT: v_writelane_b32 v40, s10, 6
-; GFX11-NEXT: s_mov_b32 s10, 7
; GFX11-NEXT: v_writelane_b32 v40, s11, 7
-; GFX11-NEXT: s_mov_b32 s11, 8
; GFX11-NEXT: v_writelane_b32 v40, s30, 8
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg at abs32@lo
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
+; GFX11-NEXT: s_mov_b32 s6, 3
+; GFX11-NEXT: s_mov_b32 s7, 4
+; GFX11-NEXT: s_mov_b32 s8, 5
+; GFX11-NEXT: s_mov_b32 s9, 6
+; GFX11-NEXT: s_mov_b32 s10, 7
+; GFX11-NEXT: s_mov_b32 s11, 8
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 8
; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s11, v40, 7
@@ -14765,27 +14768,27 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 6
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 7
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7
-; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 8
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
+; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5
+; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 6
+; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 7
+; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 8
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
@@ -14820,6 +14823,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 18
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -14831,19 +14835,18 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s12, 8
; GFX9-NEXT: v_writelane_b32 v40, s13, 9
; GFX9-NEXT: v_writelane_b32 v40, s14, 10
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s15, 11
; GFX9-NEXT: v_writelane_b32 v40, s16, 12
; GFX9-NEXT: v_writelane_b32 v40, s17, 13
; GFX9-NEXT: v_writelane_b32 v40, s18, 14
; GFX9-NEXT: v_writelane_b32 v40, s19, 15
+; GFX9-NEXT: v_writelane_b32 v40, s30, 16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 17
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 16
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 17
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 16
; GFX9-NEXT: v_readlane_b32 s31, v40, 17
@@ -14882,7 +14885,6 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 18
-; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
@@ -14900,12 +14902,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-NEXT: v_writelane_b32 v40, s19, 15
+; GFX10-NEXT: v_writelane_b32 v40, s30, 16
+; GFX10-NEXT: v_writelane_b32 v40, s31, 17
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 16
-; GFX10-NEXT: v_writelane_b32 v40, s31, 17
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 16
; GFX10-NEXT: v_readlane_b32 s31, v40, 17
@@ -14944,7 +14947,6 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 18
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
@@ -14962,14 +14964,15 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s17, 13
; GFX11-NEXT: v_writelane_b32 v40, s18, 14
; GFX11-NEXT: v_writelane_b32 v40, s19, 15
+; GFX11-NEXT: v_writelane_b32 v40, s30, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 17
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16i32_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16i32_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 17
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 16
; GFX11-NEXT: v_readlane_b32 s31, v40, 17
; GFX11-NEXT: v_readlane_b32 s19, v40, 15
@@ -15007,7 +15010,6 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 18
-; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
@@ -15025,12 +15027,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17
+; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16i32_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 16
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 17
@@ -15075,6 +15078,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 28
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -15091,23 +15095,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s17, 13
; GFX9-NEXT: v_writelane_b32 v40, s18, 14
; GFX9-NEXT: v_writelane_b32 v40, s19, 15
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s20, 16
; GFX9-NEXT: v_writelane_b32 v40, s21, 17
; GFX9-NEXT: v_writelane_b32 v40, s22, 18
; GFX9-NEXT: v_writelane_b32 v40, s23, 19
; GFX9-NEXT: v_writelane_b32 v40, s24, 20
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s25, 21
; GFX9-NEXT: v_writelane_b32 v40, s26, 22
; GFX9-NEXT: v_writelane_b32 v40, s27, 23
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s28, 24
+; GFX9-NEXT: v_writelane_b32 v40, s29, 25
+; GFX9-NEXT: v_writelane_b32 v40, s30, 26
+; GFX9-NEXT: v_writelane_b32 v40, s31, 27
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40
+; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg at abs32@lo
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s46
-; GFX9-NEXT: v_writelane_b32 v40, s29, 25
; GFX9-NEXT: v_mov_b32_e32 v1, s47
; GFX9-NEXT: v_mov_b32_e32 v2, s48
; GFX9-NEXT: v_mov_b32_e32 v3, s49
@@ -15116,11 +15123,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
; GFX9-NEXT: v_mov_b32_e32 v0, s50
-; GFX9-NEXT: v_writelane_b32 v40, s30, 26
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT: v_mov_b32_e32 v0, s51
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s20, s36
; GFX9-NEXT: s_mov_b32 s21, s37
; GFX9-NEXT: s_mov_b32 s22, s38
@@ -15131,7 +15135,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX9-NEXT: s_mov_b32 s27, s43
; GFX9-NEXT: s_mov_b32 s28, s44
; GFX9-NEXT: s_mov_b32 s29, s45
-; GFX9-NEXT: v_writelane_b32 v40, s31, 27
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 26
@@ -15181,7 +15184,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 28
-; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
@@ -15199,29 +15201,40 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-NEXT: v_writelane_b32 v40, s19, 15
+; GFX10-NEXT: v_writelane_b32 v40, s20, 16
+; GFX10-NEXT: v_writelane_b32 v40, s21, 17
+; GFX10-NEXT: v_writelane_b32 v40, s22, 18
+; GFX10-NEXT: v_writelane_b32 v40, s23, 19
+; GFX10-NEXT: v_writelane_b32 v40, s24, 20
+; GFX10-NEXT: v_writelane_b32 v40, s25, 21
+; GFX10-NEXT: v_writelane_b32 v40, s26, 22
+; GFX10-NEXT: v_writelane_b32 v40, s27, 23
+; GFX10-NEXT: v_writelane_b32 v40, s28, 24
+; GFX10-NEXT: v_writelane_b32 v40, s29, 25
+; GFX10-NEXT: v_writelane_b32 v40, s30, 26
+; GFX10-NEXT: v_writelane_b32 v40, s31, 27
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s20, 16
-; GFX10-NEXT: v_writelane_b32 v40, s21, 17
-; GFX10-NEXT: v_writelane_b32 v40, s22, 18
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s46
-; GFX10-NEXT: v_writelane_b32 v40, s23, 19
; GFX10-NEXT: v_mov_b32_e32 v1, s47
; GFX10-NEXT: v_mov_b32_e32 v2, s48
; GFX10-NEXT: v_mov_b32_e32 v3, s49
; GFX10-NEXT: s_mov_b32 s20, s36
-; GFX10-NEXT: v_writelane_b32 v40, s24, 20
; GFX10-NEXT: s_mov_b32 s21, s37
; GFX10-NEXT: s_mov_b32 s22, s38
; GFX10-NEXT: s_mov_b32 s23, s39
; GFX10-NEXT: s_mov_b32 s24, s40
-; GFX10-NEXT: v_writelane_b32 v40, s25, 21
; GFX10-NEXT: s_mov_b32 s25, s41
+; GFX10-NEXT: s_mov_b32 s26, s42
+; GFX10-NEXT: s_mov_b32 s27, s43
+; GFX10-NEXT: s_mov_b32 s28, s44
+; GFX10-NEXT: s_mov_b32 s29, s45
; GFX10-NEXT: v_mov_b32_e32 v4, s50
; GFX10-NEXT: v_mov_b32_e32 v5, s51
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
@@ -15230,16 +15243,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20
-; GFX10-NEXT: v_writelane_b32 v40, s26, 22
-; GFX10-NEXT: s_mov_b32 s26, s42
-; GFX10-NEXT: v_writelane_b32 v40, s27, 23
-; GFX10-NEXT: s_mov_b32 s27, s43
-; GFX10-NEXT: v_writelane_b32 v40, s28, 24
-; GFX10-NEXT: s_mov_b32 s28, s44
-; GFX10-NEXT: v_writelane_b32 v40, s29, 25
-; GFX10-NEXT: s_mov_b32 s29, s45
-; GFX10-NEXT: v_writelane_b32 v40, s30, 26
-; GFX10-NEXT: v_writelane_b32 v40, s31, 27
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 26
; GFX10-NEXT: v_readlane_b32 s31, v40, 27
@@ -15288,10 +15291,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 28
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s2, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
@@ -15308,42 +15308,43 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s17, 13
; GFX11-NEXT: v_writelane_b32 v40, s18, 14
; GFX11-NEXT: v_writelane_b32 v40, s19, 15
+; GFX11-NEXT: v_writelane_b32 v40, s20, 16
+; GFX11-NEXT: v_writelane_b32 v40, s21, 17
+; GFX11-NEXT: v_writelane_b32 v40, s22, 18
+; GFX11-NEXT: v_writelane_b32 v40, s23, 19
+; GFX11-NEXT: v_writelane_b32 v40, s24, 20
+; GFX11-NEXT: v_writelane_b32 v40, s25, 21
+; GFX11-NEXT: v_writelane_b32 v40, s26, 22
+; GFX11-NEXT: v_writelane_b32 v40, s27, 23
+; GFX11-NEXT: v_writelane_b32 v40, s28, 24
+; GFX11-NEXT: v_writelane_b32 v40, s29, 25
+; GFX11-NEXT: v_writelane_b32 v40, s30, 26
+; GFX11-NEXT: v_writelane_b32 v40, s31, 27
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_add_i32 s2, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x40
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s20, 16
-; GFX11-NEXT: v_writelane_b32 v40, s21, 17
-; GFX11-NEXT: v_writelane_b32 v40, s22, 18
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v5, s51
-; GFX11-NEXT: v_writelane_b32 v40, s23, 19
; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v1, s47
; GFX11-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49
-; GFX11-NEXT: v_writelane_b32 v40, s24, 20
; GFX11-NEXT: s_mov_b32 s20, s36
; GFX11-NEXT: s_mov_b32 s21, s37
; GFX11-NEXT: s_mov_b32 s22, s38
; GFX11-NEXT: s_mov_b32 s23, s39
-; GFX11-NEXT: v_writelane_b32 v40, s25, 21
; GFX11-NEXT: s_mov_b32 s24, s40
; GFX11-NEXT: s_mov_b32 s25, s41
-; GFX11-NEXT: scratch_store_b64 off, v[4:5], s2
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
-; GFX11-NEXT: v_writelane_b32 v40, s26, 22
; GFX11-NEXT: s_mov_b32 s26, s42
-; GFX11-NEXT: v_writelane_b32 v40, s27, 23
; GFX11-NEXT: s_mov_b32 s27, s43
-; GFX11-NEXT: v_writelane_b32 v40, s28, 24
; GFX11-NEXT: s_mov_b32 s28, s44
-; GFX11-NEXT: v_writelane_b32 v40, s29, 25
; GFX11-NEXT: s_mov_b32 s29, s45
-; GFX11-NEXT: v_writelane_b32 v40, s30, 26
-; GFX11-NEXT: v_writelane_b32 v40, s31, 27
+; GFX11-NEXT: scratch_store_b64 off, v[4:5], s2
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 26
; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s29, v40, 25
@@ -15391,9 +15392,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28
-; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
@@ -15410,43 +15409,45 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27
+; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40
; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49
; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36
; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37
; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21
; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39
; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40
; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41
-; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s2
-; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22
; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23
; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24
; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25
; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27
+; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s2
+; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
@@ -15501,6 +15502,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 28
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -15516,42 +15518,42 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s16, 12
; GFX9-NEXT: v_writelane_b32 v40, s17, 13
; GFX9-NEXT: v_writelane_b32 v40, s18, 14
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s19, 15
; GFX9-NEXT: v_writelane_b32 v40, s20, 16
; GFX9-NEXT: v_writelane_b32 v40, s21, 17
; GFX9-NEXT: v_writelane_b32 v40, s22, 18
; GFX9-NEXT: v_writelane_b32 v40, s23, 19
+; GFX9-NEXT: v_writelane_b32 v40, s24, 20
+; GFX9-NEXT: v_writelane_b32 v40, s25, 21
+; GFX9-NEXT: v_writelane_b32 v40, s26, 22
+; GFX9-NEXT: v_writelane_b32 v40, s27, 23
+; GFX9-NEXT: v_writelane_b32 v40, s28, 24
+; GFX9-NEXT: v_writelane_b32 v40, s29, 25
+; GFX9-NEXT: v_writelane_b32 v40, s30, 26
+; GFX9-NEXT: v_writelane_b32 v40, s31, 27
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s52, s[34:35], 0x0
; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35
; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35
; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s24, 20
-; GFX9-NEXT: v_writelane_b32 v40, s25, 21
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s26, 22
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg at abs32@lo
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s52
-; GFX9-NEXT: v_writelane_b32 v40, s27, 23
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX9-NEXT: v_mov_b32_e32 v0, s46
-; GFX9-NEXT: v_writelane_b32 v40, s28, 24
; GFX9-NEXT: v_mov_b32_e32 v1, s47
; GFX9-NEXT: v_mov_b32_e32 v2, s48
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, s49
-; GFX9-NEXT: v_writelane_b32 v40, s29, 25
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT: v_mov_b32_e32 v0, s50
-; GFX9-NEXT: v_writelane_b32 v40, s30, 26
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT: v_mov_b32_e32 v0, s51
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg at abs32@lo
; GFX9-NEXT: s_mov_b32 s20, s36
; GFX9-NEXT: s_mov_b32 s21, s37
; GFX9-NEXT: s_mov_b32 s22, s38
@@ -15562,7 +15564,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-NEXT: s_mov_b32 s27, s43
; GFX9-NEXT: s_mov_b32 s28, s44
; GFX9-NEXT: s_mov_b32 s29, s45
-; GFX9-NEXT: v_writelane_b32 v40, s31, 27
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 26
@@ -15612,7 +15613,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 28
-; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
@@ -15630,6 +15630,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-NEXT: v_writelane_b32 v40, s19, 15
+; GFX10-NEXT: v_writelane_b32 v40, s20, 16
+; GFX10-NEXT: v_writelane_b32 v40, s21, 17
+; GFX10-NEXT: v_writelane_b32 v40, s22, 18
+; GFX10-NEXT: v_writelane_b32 v40, s23, 19
+; GFX10-NEXT: v_writelane_b32 v40, s24, 20
+; GFX10-NEXT: v_writelane_b32 v40, s25, 21
+; GFX10-NEXT: v_writelane_b32 v40, s26, 22
+; GFX10-NEXT: v_writelane_b32 v40, s27, 23
+; GFX10-NEXT: v_writelane_b32 v40, s28, 24
+; GFX10-NEXT: v_writelane_b32 v40, s29, 25
+; GFX10-NEXT: v_writelane_b32 v40, s30, 26
+; GFX10-NEXT: v_writelane_b32 v40, s31, 27
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s52, s[34:35], 0x0
@@ -15639,28 +15652,24 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s20, 16
-; GFX10-NEXT: v_writelane_b32 v40, s21, 17
-; GFX10-NEXT: v_writelane_b32 v40, s22, 18
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s52
-; GFX10-NEXT: v_writelane_b32 v40, s23, 19
; GFX10-NEXT: v_mov_b32_e32 v1, s47
; GFX10-NEXT: v_mov_b32_e32 v2, s48
; GFX10-NEXT: v_mov_b32_e32 v3, s49
+; GFX10-NEXT: s_mov_b32 s20, s36
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
-; GFX10-NEXT: v_writelane_b32 v40, s24, 20
; GFX10-NEXT: v_mov_b32_e32 v0, s46
-; GFX10-NEXT: s_mov_b32 s20, s36
; GFX10-NEXT: s_mov_b32 s21, s37
; GFX10-NEXT: s_mov_b32 s22, s38
-; GFX10-NEXT: v_writelane_b32 v40, s25, 21
; GFX10-NEXT: s_mov_b32 s23, s39
; GFX10-NEXT: s_mov_b32 s24, s40
; GFX10-NEXT: s_mov_b32 s25, s41
-; GFX10-NEXT: v_mov_b32_e32 v4, s50
-; GFX10-NEXT: v_writelane_b32 v40, s26, 22
; GFX10-NEXT: s_mov_b32 s26, s42
+; GFX10-NEXT: s_mov_b32 s27, s43
+; GFX10-NEXT: s_mov_b32 s28, s44
+; GFX10-NEXT: s_mov_b32 s29, s45
+; GFX10-NEXT: v_mov_b32_e32 v4, s50
; GFX10-NEXT: v_mov_b32_e32 v5, s51
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
@@ -15668,14 +15677,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20
-; GFX10-NEXT: v_writelane_b32 v40, s27, 23
-; GFX10-NEXT: s_mov_b32 s27, s43
-; GFX10-NEXT: v_writelane_b32 v40, s28, 24
-; GFX10-NEXT: s_mov_b32 s28, s44
-; GFX10-NEXT: v_writelane_b32 v40, s29, 25
-; GFX10-NEXT: s_mov_b32 s29, s45
-; GFX10-NEXT: v_writelane_b32 v40, s30, 26
-; GFX10-NEXT: v_writelane_b32 v40, s31, 27
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 26
; GFX10-NEXT: v_readlane_b32 s31, v40, 27
@@ -15724,10 +15725,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 28
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s3, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
@@ -15744,6 +15742,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s17, 13
; GFX11-NEXT: v_writelane_b32 v40, s18, 14
; GFX11-NEXT: v_writelane_b32 v40, s19, 15
+; GFX11-NEXT: v_writelane_b32 v40, s20, 16
+; GFX11-NEXT: v_writelane_b32 v40, s21, 17
+; GFX11-NEXT: v_writelane_b32 v40, s22, 18
+; GFX11-NEXT: v_writelane_b32 v40, s23, 19
+; GFX11-NEXT: v_writelane_b32 v40, s24, 20
+; GFX11-NEXT: v_writelane_b32 v40, s25, 21
+; GFX11-NEXT: v_writelane_b32 v40, s26, 22
+; GFX11-NEXT: v_writelane_b32 v40, s27, 23
+; GFX11-NEXT: v_writelane_b32 v40, s28, 24
+; GFX11-NEXT: v_writelane_b32 v40, s29, 25
+; GFX11-NEXT: v_writelane_b32 v40, s30, 26
+; GFX11-NEXT: v_writelane_b32 v40, s31, 27
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_add_i32 s3, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
@@ -15751,39 +15763,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32_inreg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32_inreg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s20, 16
-; GFX11-NEXT: v_writelane_b32 v40, s21, 17
-; GFX11-NEXT: v_writelane_b32 v40, s22, 18
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v5, s51
-; GFX11-NEXT: v_writelane_b32 v40, s23, 19
; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v1, s47
; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49
-; GFX11-NEXT: v_writelane_b32 v40, s24, 20
; GFX11-NEXT: v_mov_b32_e32 v2, s48
; GFX11-NEXT: s_add_i32 s2, s32, 24
; GFX11-NEXT: s_mov_b32 s20, s36
; GFX11-NEXT: s_mov_b32 s21, s37
-; GFX11-NEXT: v_writelane_b32 v40, s25, 21
; GFX11-NEXT: s_mov_b32 s22, s38
; GFX11-NEXT: s_mov_b32 s23, s39
; GFX11-NEXT: s_mov_b32 s24, s40
; GFX11-NEXT: s_mov_b32 s25, s41
-; GFX11-NEXT: v_writelane_b32 v40, s26, 22
; GFX11-NEXT: s_mov_b32 s26, s42
-; GFX11-NEXT: scratch_store_b32 off, v6, s2
-; GFX11-NEXT: scratch_store_b64 off, v[4:5], s3
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
-; GFX11-NEXT: v_writelane_b32 v40, s27, 23
; GFX11-NEXT: s_mov_b32 s27, s43
-; GFX11-NEXT: v_writelane_b32 v40, s28, 24
; GFX11-NEXT: s_mov_b32 s28, s44
-; GFX11-NEXT: v_writelane_b32 v40, s29, 25
; GFX11-NEXT: s_mov_b32 s29, s45
-; GFX11-NEXT: v_writelane_b32 v40, s30, 26
-; GFX11-NEXT: v_writelane_b32 v40, s31, 27
+; GFX11-NEXT: scratch_store_b32 off, v6, s2
+; GFX11-NEXT: scratch_store_b64 off, v[4:5], s3
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 26
; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s29, v40, 25
@@ -15831,9 +15830,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28
-; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: s_add_i32 s3, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
@@ -15850,6 +15847,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27
+; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_add_i32 s3, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x2
; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0
@@ -15859,40 +15870,28 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32_inreg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49
; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 24
; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21
; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37
; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38
; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39
; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22
; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41
; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42
-; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s2
-; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s3
-; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23
; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24
; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25
; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s2
+; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s3
+; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
@@ -15947,14 +15946,14 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4
; GFX9-NEXT: s_mov_b32 s35, stack_passed_f64_arg at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, stack_passed_f64_arg at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -15980,19 +15979,19 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_mov_b32 s35, stack_passed_f64_arg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, stack_passed_f64_arg at abs32@lo
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -16014,13 +16013,13 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: s_mov_b32 s1, stack_passed_f64_arg at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, stack_passed_f64_arg at abs32@lo
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33
+; GFX11-NEXT: s_mov_b32 s1, stack_passed_f64_arg at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, stack_passed_f64_arg at abs32@lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -16044,13 +16043,13 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, stack_passed_f64_arg at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, stack_passed_f64_arg at abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, stack_passed_f64_arg at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, stack_passed_f64_arg at abs32@lo
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -16079,16 +16078,17 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 12
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, 14
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT: s_mov_b32 s35, external_void_func_12xv3i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_12xv3i32 at abs32@lo
@@ -16124,7 +16124,6 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v29, 9
; GFX9-NEXT: v_mov_b32_e32 v30, 10
; GFX9-NEXT: v_mov_b32_e32 v31, 11
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -16147,12 +16146,14 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 12
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_mov_b32_e32 v2, 14
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_mov_b32_e32 v3, 15
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 1
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
@@ -16161,7 +16162,6 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 1
-; GFX10-NEXT: v_mov_b32_e32 v4, 1
; GFX10-NEXT: v_mov_b32_e32 v5, 1
; GFX10-NEXT: v_mov_b32_e32 v6, 2
; GFX10-NEXT: v_mov_b32_e32 v7, 2
@@ -16191,7 +16191,6 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-NEXT: v_mov_b32_e32 v31, 11
; GFX10-NEXT: s_mov_b32 s35, external_void_func_12xv3i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_12xv3i32 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -16214,15 +16213,16 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13
-; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13
+; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15
; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1
+; GFX11-NEXT: v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, 2
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1
-; GFX11-NEXT: v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, 2
; GFX11-NEXT: v_dual_mov_b32 v8, 2 :: v_dual_mov_b32 v9, 3
; GFX11-NEXT: v_dual_mov_b32 v10, 3 :: v_dual_mov_b32 v11, 3
; GFX11-NEXT: v_dual_mov_b32 v12, 4 :: v_dual_mov_b32 v13, 4
@@ -16237,9 +16237,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX11-NEXT: v_dual_mov_b32 v30, 10 :: v_dual_mov_b32 v31, 11
; GFX11-NEXT: s_mov_b32 s1, external_void_func_12xv3i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_12xv3i32 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -16261,21 +16260,22 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 2
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 2
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 1
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 2
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v9, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v10, 3
@@ -16302,7 +16302,6 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 11
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_12xv3i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_12xv3i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -16341,7 +16340,10 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: v_mov_b32_e32 v0, 9
@@ -16355,10 +16357,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT: v_mov_b32_e32 v0, 14
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28
; GFX9-NEXT: s_mov_b32 s35, external_void_func_8xv5i32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_8xv5i32 at abs32@lo
@@ -16394,7 +16394,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v29, 5
; GFX9-NEXT: v_mov_b32_e32 v30, 6
; GFX9-NEXT: v_mov_b32_e32 v31, 7
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -16416,20 +16415,22 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 8
; GFX10-NEXT: v_mov_b32_e32 v1, 9
; GFX10-NEXT: v_mov_b32_e32 v2, 10
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: v_mov_b32_e32 v3, 14
+; GFX10-NEXT: v_mov_b32_e32 v4, 15
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX10-NEXT: v_mov_b32_e32 v0, 11
; GFX10-NEXT: v_mov_b32_e32 v1, 12
; GFX10-NEXT: v_mov_b32_e32 v2, 13
-; GFX10-NEXT: v_mov_b32_e32 v3, 14
-; GFX10-NEXT: v_mov_b32_e32 v4, 15
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_mov_b32_e32 v5, 1
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
@@ -16440,7 +16441,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_mov_b32_e32 v5, 1
; GFX10-NEXT: v_mov_b32_e32 v6, 1
; GFX10-NEXT: v_mov_b32_e32 v7, 1
; GFX10-NEXT: v_mov_b32_e32 v8, 1
@@ -16469,7 +16469,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-NEXT: v_mov_b32_e32 v31, 7
; GFX10-NEXT: s_mov_b32 s35, external_void_func_8xv5i32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_8xv5i32 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -16492,12 +16491,13 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 9
; GFX11-NEXT: v_dual_mov_b32 v2, 10 :: v_dual_mov_b32 v3, 11
; GFX11-NEXT: v_dual_mov_b32 v4, 12 :: v_dual_mov_b32 v5, 13
; GFX11-NEXT: v_dual_mov_b32 v6, 14 :: v_dual_mov_b32 v7, 15
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s0, s32, 16
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0
@@ -16519,9 +16519,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX11-NEXT: v_dual_mov_b32 v30, 6 :: v_dual_mov_b32 v31, 7
; GFX11-NEXT: s_mov_b32 s1, external_void_func_8xv5i32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_8xv5i32 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -16543,6 +16542,9 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 8
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 9
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 10
@@ -16551,8 +16553,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 13
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 14
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 15
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s0
@@ -16590,7 +16590,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 7
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_8xv5i32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_8xv5i32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -16625,7 +16624,10 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41100000
@@ -16639,10 +16641,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28
; GFX9-NEXT: s_mov_b32 s35, external_void_func_8xv5f32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_8xv5f32 at abs32@lo
@@ -16678,7 +16678,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000
; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000
; GFX9-NEXT: v_mov_b32_e32 v31, 0x40e00000
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -16700,20 +16699,22 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000
; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000
+; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX10-NEXT: v_mov_b32_e32 v0, 0x41300000
; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000
; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000
-; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000
-; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_mov_b32_e32 v5, 1.0
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
@@ -16724,7 +16725,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_mov_b32_e32 v5, 1.0
; GFX10-NEXT: v_mov_b32_e32 v6, 1.0
; GFX10-NEXT: v_mov_b32_e32 v7, 1.0
; GFX10-NEXT: v_mov_b32_e32 v8, 1.0
@@ -16753,7 +16753,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-NEXT: v_mov_b32_e32 v31, 0x40e00000
; GFX10-NEXT: s_mov_b32 s35, external_void_func_8xv5f32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_8xv5f32 at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -16776,6 +16775,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x41100000
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41200000
@@ -16784,8 +16786,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX11-NEXT: v_mov_b32_e32 v5, 0x41500000
; GFX11-NEXT: v_mov_b32_e32 v6, 0x41600000
; GFX11-NEXT: v_mov_b32_e32 v7, 0x41700000
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s0, s32, 16
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0
@@ -16808,9 +16808,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX11-NEXT: v_mov_b32_e32 v31, 0x40e00000
; GFX11-NEXT: s_mov_b32 s1, external_void_func_8xv5f32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_8xv5f32 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -16832,6 +16831,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41100000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41200000
@@ -16840,8 +16842,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x41500000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41600000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41700000
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s0
@@ -16879,7 +16879,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 0x40e00000
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_8xv5f32 at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_8xv5f32 at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -16916,10 +16915,10 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -16942,11 +16941,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -16969,13 +16968,13 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -16997,11 +16996,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -17030,10 +17029,10 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -17056,11 +17055,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -17083,13 +17082,13 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -17111,11 +17110,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -17144,10 +17143,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -17170,11 +17169,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -17197,13 +17196,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -17225,11 +17224,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -17258,10 +17257,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -17284,11 +17283,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -17311,13 +17310,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -17339,11 +17338,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -17372,10 +17371,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -17398,11 +17397,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -17425,13 +17424,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -17453,11 +17452,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -17486,10 +17485,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -17512,11 +17511,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -17539,13 +17538,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -17567,11 +17566,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -17600,10 +17599,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -17626,11 +17625,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -17653,13 +17652,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -17681,11 +17680,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -17714,10 +17713,10 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -17740,11 +17739,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -17767,13 +17766,13 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -17795,11 +17794,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -17828,10 +17827,10 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -17854,11 +17853,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -17881,13 +17880,13 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -17909,11 +17908,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -17942,10 +17941,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -17968,11 +17967,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -17995,13 +17994,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -18023,11 +18022,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -18056,10 +18055,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -18082,11 +18081,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -18109,13 +18108,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -18137,11 +18136,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -18170,10 +18169,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -18196,11 +18195,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -18223,13 +18222,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -18251,11 +18250,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -18284,10 +18283,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -18310,11 +18309,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -18337,13 +18336,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -18365,11 +18364,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
@@ -18398,10 +18397,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -18424,11 +18423,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -18451,13 +18450,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -18479,11 +18478,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 260398a519660..d43e47e5a4b70 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -15,13 +15,13 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_mov_b32 s5, external_void_func_void at abs32@hi
; GFX9-NEXT: s_mov_b32 s4, external_void_func_void at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
@@ -51,11 +51,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX10-NEXT: v_writelane_b32 v40, s34, 4
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, external_void_func_void at abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, external_void_func_void at abs32@hi
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: s_mov_b32 s5, external_void_func_void at abs32@hi
+; GFX10-NEXT: s_mov_b32 s4, external_void_func_void at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
@@ -85,16 +85,16 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX11-NEXT: v_writelane_b32 v40, s0, 4
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, external_void_func_void at abs32@lo
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, external_void_func_void at abs32@hi
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_mov_b32 s5, external_void_func_void at abs32@hi
+; GFX11-NEXT: s_mov_b32 s4, external_void_func_void at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -209,12 +209,12 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s31
; GFX9-NEXT: ;;#ASMEND
@@ -246,12 +246,12 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def s31
; GFX10-NEXT: ;;#ASMEND
@@ -283,12 +283,12 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_void at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_void at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_void at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_void at abs32@lo
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s31
; GFX11-NEXT: ;;#ASMEND
@@ -325,12 +325,12 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v41, s34, 2
-; GFX9-NEXT: v_writelane_b32 v41, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v41, s30, 0
; GFX9-NEXT: v_writelane_b32 v41, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def v31
; GFX9-NEXT: ;;#ASMEND
@@ -362,16 +362,16 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v41, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v41, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def v31
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_mov_b32_e32 v40, v31
-; GFX10-NEXT: v_writelane_b32 v41, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_mov_b32_e32 v31, v40
; GFX10-NEXT: ;;#ASMSTART
@@ -399,18 +399,18 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v41, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_void at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_void at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: v_writelane_b32 v41, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_void at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_void at abs32@lo
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def v31
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: v_mov_b32_e32 v40, v31
-; GFX11-NEXT: v_writelane_b32 v41, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v31, v40
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use v31
@@ -443,12 +443,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s33
; GFX9-NEXT: ;;#ASMEND
@@ -480,16 +480,16 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def s33
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b32 s4, s33
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_mov_b32 s33, s4
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
@@ -517,19 +517,18 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
+; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_mov_b32 s1, external_void_func_void at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_void at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s33
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b32 s4, s33
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_mov_b32 s33, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s33
@@ -560,16 +559,16 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s34
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: s_mov_b32 s4, s34
; GFX9-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: s_mov_b32 s34, s4
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
@@ -597,16 +596,16 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def s34
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b32 s4, s34
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_mov_b32 s34, s4
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
@@ -634,18 +633,17 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
+; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_mov_b32 s1, external_void_func_void at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_void at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s34
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b32 s4, s34
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: s_mov_b32 s34, s4
; GFX11-NEXT: ;;#ASMSTART
@@ -677,12 +675,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v41, s34, 2
-; GFX9-NEXT: v_writelane_b32 v41, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v41, s30, 0
; GFX9-NEXT: v_writelane_b32 v41, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def v40
; GFX9-NEXT: ;;#ASMEND
@@ -712,15 +710,15 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v41, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v41, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def v40
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_writelane_b32 v41, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use v40
@@ -747,15 +745,15 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v41, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_void at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_void at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: v_writelane_b32 v41, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_void at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_void at abs32@lo
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def v40
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v41, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use v40
@@ -844,10 +842,10 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, void_func_void_clobber_s33 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, void_func_void_clobber_s33 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, void_func_void_clobber_s33 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, void_func_void_clobber_s33 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -870,11 +868,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, void_func_void_clobber_s33 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, void_func_void_clobber_s33 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, void_func_void_clobber_s33 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, void_func_void_clobber_s33 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -897,13 +895,13 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, void_func_void_clobber_s33 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, void_func_void_clobber_s33 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, void_func_void_clobber_s33 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, void_func_void_clobber_s33 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -929,10 +927,10 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, void_func_void_clobber_s34 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, void_func_void_clobber_s34 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, void_func_void_clobber_s34 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, void_func_void_clobber_s34 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -955,11 +953,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: s_mov_b32 s35, void_func_void_clobber_s34 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, void_func_void_clobber_s34 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, void_func_void_clobber_s34 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, void_func_void_clobber_s34 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -982,13 +980,13 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, void_func_void_clobber_s34 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, void_func_void_clobber_s34 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, void_func_void_clobber_s34 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, void_func_void_clobber_s34 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -1013,12 +1011,12 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s40
; GFX9-NEXT: ;;#ASMEND
@@ -1049,16 +1047,16 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def s40
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b32 s4, s40
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: ;;#ASMSTART
@@ -1085,18 +1083,17 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
+; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_mov_b32 s1, external_void_func_void at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_void at abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s40
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b32 s4, s40
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s4
@@ -1127,13 +1124,13 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v41, s34, 3
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v41, s4, 0
; GFX9-NEXT: v_writelane_b32 v41, s30, 1
+; GFX9-NEXT: v_writelane_b32 v41, s31, 2
; GFX9-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: v_writelane_b32 v41, s31, 2
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s40
; GFX9-NEXT: ;;#ASMEND
@@ -1172,11 +1169,13 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v41, s34, 3
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v41, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s30, 1
+; GFX10-NEXT: v_writelane_b32 v41, s31, 2
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_void at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_void at abs32@lo
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def s40
; GFX10-NEXT: ;;#ASMEND
@@ -1185,8 +1184,6 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX10-NEXT: ; def v32
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_mov_b32_e32 v40, v32
-; GFX10-NEXT: v_writelane_b32 v41, s30, 1
-; GFX10-NEXT: v_writelane_b32 v41, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s4
@@ -1217,11 +1214,13 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v41, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_void at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_void at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: v_writelane_b32 v41, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s30, 1
+; GFX11-NEXT: v_writelane_b32 v41, s31, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_void at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_void at abs32@lo
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s40
; GFX11-NEXT: ;;#ASMEND
@@ -1230,8 +1229,6 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX11-NEXT: ; def v32
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: v_mov_b32_e32 v40, v32
-; GFX11-NEXT: v_writelane_b32 v41, s30, 1
-; GFX11-NEXT: v_writelane_b32 v41, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s4
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 0b54bbd7e2105..20490572114fe 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -29,10 +29,10 @@ define amdgpu_gfx void @call_i1() #0 {
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: v_writelane_b32 v1, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, return_i1 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, return_i1 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v1, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, return_i1 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, return_i1 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v1, 0
; GFX9-NEXT: v_readlane_b32 s31, v1, 1
@@ -54,10 +54,10 @@ define amdgpu_gfx void @call_i1() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_writelane_b32 v1, s30, 0
-; GFX10-NEXT: s_mov_b32 s35, return_i1 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, return_i1 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v1, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, return_i1 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, return_i1 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v1, 0
; GFX10-NEXT: v_readlane_b32 s31, v1, 1
@@ -79,12 +79,12 @@ define amdgpu_gfx void @call_i1() #0 {
; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v1, s30, 0
-; GFX11-NEXT: s_mov_b32 s1, return_i1 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, return_i1 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v1, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, return_i1 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, return_i1 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v1, 0
; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -131,10 +131,10 @@ define amdgpu_gfx void @call_i16() #0 {
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: v_writelane_b32 v1, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, return_i16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, return_i16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v1, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, return_i16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, return_i16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v1, 0
; GFX9-NEXT: v_readlane_b32 s31, v1, 1
@@ -156,10 +156,10 @@ define amdgpu_gfx void @call_i16() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_writelane_b32 v1, s30, 0
-; GFX10-NEXT: s_mov_b32 s35, return_i16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, return_i16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v1, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, return_i16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, return_i16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v1, 0
; GFX10-NEXT: v_readlane_b32 s31, v1, 1
@@ -181,12 +181,12 @@ define amdgpu_gfx void @call_i16() #0 {
; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v1, s30, 0
-; GFX11-NEXT: s_mov_b32 s1, return_i16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, return_i16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v1, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, return_i16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, return_i16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v1, 0
; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -227,10 +227,10 @@ define amdgpu_gfx void @call_2xi16() #0 {
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: v_writelane_b32 v1, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, return_2xi16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, return_2xi16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v1, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, return_2xi16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, return_2xi16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v1, 0
; GFX9-NEXT: v_readlane_b32 s31, v1, 1
@@ -252,10 +252,10 @@ define amdgpu_gfx void @call_2xi16() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_writelane_b32 v1, s30, 0
-; GFX10-NEXT: s_mov_b32 s35, return_2xi16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, return_2xi16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v1, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, return_2xi16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, return_2xi16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v1, 0
; GFX10-NEXT: v_readlane_b32 s31, v1, 1
@@ -277,12 +277,12 @@ define amdgpu_gfx void @call_2xi16() #0 {
; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v1, s30, 0
-; GFX11-NEXT: s_mov_b32 s1, return_2xi16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, return_2xi16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v1, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, return_2xi16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, return_2xi16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v1, 0
; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -331,10 +331,10 @@ define amdgpu_gfx void @call_3xi16() #0 {
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: v_writelane_b32 v2, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, return_3xi16 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, return_3xi16 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, return_3xi16 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, return_3xi16 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
@@ -356,10 +356,10 @@ define amdgpu_gfx void @call_3xi16() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_writelane_b32 v2, s30, 0
-; GFX10-NEXT: s_mov_b32 s35, return_3xi16 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, return_3xi16 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, return_3xi16 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, return_3xi16 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
@@ -381,12 +381,12 @@ define amdgpu_gfx void @call_3xi16() #0 {
; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v2, s30, 0
-; GFX11-NEXT: s_mov_b32 s1, return_3xi16 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, return_3xi16 at abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, return_3xi16 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, return_3xi16 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -680,9 +680,6 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v100, s30, 0
-; GFX9-NEXT: s_mov_b32 s35, return_100xi32 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, return_100xi32 at abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x2400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
@@ -716,7 +713,10 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX9-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v100, s30, 0
; GFX9-NEXT: v_writelane_b32 v100, s31, 1
+; GFX9-NEXT: s_mov_b32 s35, return_100xi32 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s34, return_100xi32 at abs32@lo
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: buffer_load_dword v95, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -769,9 +769,6 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX10-NEXT: buffer_store_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v100, s30, 0
-; GFX10-NEXT: s_mov_b32 s35, return_100xi32 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, return_100xi32 at abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x1200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
@@ -805,7 +802,10 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX10-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v100, s30, 0
; GFX10-NEXT: v_writelane_b32 v100, s31, 1
+; GFX10-NEXT: s_mov_b32 s35, return_100xi32 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, return_100xi32 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v95, off, s[0:3], s33
@@ -859,44 +859,76 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v100, s33 offset:128 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v100, s30, 0
-; GFX11-NEXT: s_mov_b32 s1, return_100xi32 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, return_100xi32 at abs32@lo
; GFX11-NEXT: s_addk_i32 s32, 0x90
; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:124
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:120
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:116
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:112
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:108
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:104
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:100
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:96
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v56, s33 offset:92
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v57, s33 offset:88
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v58, s33 offset:84
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v59, s33 offset:80
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:76
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v61, s33 offset:72
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v62, s33 offset:68
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v63, s33 offset:64
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v72, s33 offset:60
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v73, s33 offset:56
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v74, s33 offset:52
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v75, s33 offset:48
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v76, s33 offset:44
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v77, s33 offset:40
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v78, s33 offset:36
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v79, s33 offset:32
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v88, s33 offset:28
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v89, s33 offset:24
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v90, s33 offset:20
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v91, s33 offset:16
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v92, s33 offset:12
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v93, s33 offset:8
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v94, s33 offset:4
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v95, s33
+; GFX11-NEXT: v_writelane_b32 v100, s30, 0
; GFX11-NEXT: v_writelane_b32 v100, s31, 1
+; GFX11-NEXT: s_mov_b32 s1, return_100xi32 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, return_100xi32 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v95, off, s33
@@ -2143,13 +2175,13 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v2, s30, 0
-; GFX9-NEXT: s_mov_b32 s37, return_512xi32 at abs32@hi
-; GFX9-NEXT: s_mov_b32 s36, return_512xi32 at abs32@lo
-; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33
; GFX9-NEXT: s_mov_b32 s38, s34
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_add_i32 s32, s32, 0x60000
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_mov_b32 s37, return_512xi32 at abs32@hi
+; GFX9-NEXT: s_mov_b32 s36, return_512xi32 at abs32@lo
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33
; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
@@ -2173,13 +2205,13 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s36
; GFX10-NEXT: v_writelane_b32 v2, s30, 0
-; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT: s_mov_b32 s37, return_512xi32 at abs32@hi
-; GFX10-NEXT: s_mov_b32 s36, return_512xi32 at abs32@lo
; GFX10-NEXT: s_mov_b32 s38, s34
; GFX10-NEXT: s_mov_b32 s34, s32
; GFX10-NEXT: s_add_i32 s32, s32, 0x30000
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
+; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
+; GFX10-NEXT: s_mov_b32 s37, return_512xi32 at abs32@hi
+; GFX10-NEXT: s_mov_b32 s36, return_512xi32 at abs32@lo
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
@@ -2204,15 +2236,15 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v5, s33 offset:2048 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v5, s30, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s33
-; GFX11-NEXT: s_mov_b32 s1, return_512xi32 at abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, return_512xi32 at abs32@lo
; GFX11-NEXT: s_mov_b32 s36, s34
; GFX11-NEXT: s_mov_b32 s34, s32
; GFX11-NEXT: s_addk_i32 s32, 0x1800
; GFX11-NEXT: v_writelane_b32 v5, s31, 1
+; GFX11-NEXT: v_mov_b32_e32 v0, s33
+; GFX11-NEXT: s_mov_b32 s1, return_512xi32 at abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, return_512xi32 at abs32@lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v5, 0
; GFX11-NEXT: v_readlane_b32 s31, v5, 1
; GFX11-NEXT: s_mov_b32 s32, s34
@@ -2520,17 +2552,29 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:200
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:196
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:192
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:188
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:184
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:180
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:176
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164
; GFX11-NEXT: s_clause 0x11
; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16
@@ -2640,6 +2684,23 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX9-NEXT: s_mov_b32 s38, s34
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_add_i32 s32, s32, 0x28000
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v63, s30, 0
+; GFX9-NEXT: v_writelane_b32 v63, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
@@ -2683,7 +2744,6 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33
-; GFX9-NEXT: v_writelane_b32 v63, s30, 0
; GFX9-NEXT: s_mov_b32 s37, return_72xi32 at abs32@hi
; GFX9-NEXT: s_mov_b32 s36, return_72xi32 at abs32@lo
; GFX9-NEXT: v_add_u32_e32 v0, 0x200, v0
@@ -2718,22 +2778,6 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX9-NEXT: v_mov_b32_e32 v29, 0
; GFX9-NEXT: v_mov_b32_e32 v30, 0
; GFX9-NEXT: v_mov_b32_e32 v31, 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: v_writelane_b32 v63, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640
@@ -2910,11 +2954,31 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s36
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b32 s38, s34
; GFX10-NEXT: s_mov_b32 s34, s32
; GFX10-NEXT: s_add_i32 s32, s32, 0x14000
+; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v63, s30, 0
+; GFX10-NEXT: v_writelane_b32 v63, s31, 1
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
@@ -2957,15 +3021,11 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: v_mov_b32_e32 v7, 0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
; GFX10-NEXT: v_mov_b32_e32 v10, 0
; GFX10-NEXT: v_mov_b32_e32 v11, 0
@@ -2991,22 +3051,6 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: v_mov_b32_e32 v31, 0
; GFX10-NEXT: s_mov_b32 s37, return_72xi32 at abs32@hi
; GFX10-NEXT: s_mov_b32 s36, return_72xi32 at abs32@lo
-; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: v_writelane_b32 v63, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX10-NEXT: s_clause 0x28
; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636
@@ -3189,31 +3233,46 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v62, s33 offset:1600 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
-; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s2, s0
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_mov_b32 s39, s34
; GFX11-NEXT: s_mov_b32 s34, s32
; GFX11-NEXT: s_addk_i32 s32, 0xa00
; GFX11-NEXT: s_clause 0xd ; 56-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:52
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:48
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:44
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:40
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:36
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:32
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:28
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:24
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v56, s33 offset:20
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v57, s33 offset:16
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v58, s33 offset:12
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v59, s33 offset:8
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:4
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v61, s33
+; GFX11-NEXT: v_writelane_b32 v62, s30, 0
+; GFX11-NEXT: v_writelane_b32 v62, s31, 1
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: s_mov_b32 s2, s0
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_add_i32 s0, s32, 0xa0
; GFX11-NEXT: s_add_i32 s1, s32, 0x90
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
@@ -3234,7 +3293,6 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: s_add_i32 s0, s32, 32
; GFX11-NEXT: s_add_i32 s1, s32, 16
; GFX11-NEXT: s_add_i32 s2, s33, 0x200
-; GFX11-NEXT: v_writelane_b32 v62, s30, 0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0
@@ -3255,7 +3313,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
; GFX11-NEXT: s_mov_b32 s1, return_72xi32 at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, return_72xi32 at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v62, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0xb
; GFX11-NEXT: scratch_load_b128 v[43:46], off, s33 offset:624
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index e1f6906a89c29..d42904f29aa59 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -30,9 +30,9 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v6, s70, 18
; CHECK-NEXT: v_writelane_b32 v6, s71, 19
; CHECK-NEXT: v_writelane_b32 v6, s30, 20
+; CHECK-NEXT: v_writelane_b32 v6, s31, 21
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: v_writelane_b32 v6, s31, 21
; CHECK-NEXT: s_mov_b32 s68, 0
; CHECK-NEXT: s_mov_b32 s69, s4
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 0191592c393ce..7ba2f1b847fa0 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -28,16 +28,15 @@ define void @f0() {
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v4, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v4, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, f1 at gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, f1 at gotpcrel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v4, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_writelane_b32 v4, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v4, 0
; GFX11-NEXT: v_readlane_b32 s31, v4, 1
; GFX11-NEXT: s_mov_b32 s32, s33
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
index 1ece1dc7e6898..27b3d4ac4c10c 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -15,8 +15,11 @@ define fastcc i32 @foo() #0 {
; CHECK-NEXT: $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.1, addrspace 5)
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr17
- ; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40
+ ; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc
+ ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $pc_reg, $vgpr127, 0, 32, $vgpr127, 1, 32
; CHECK-NEXT: BUNDLE implicit-def $sgpr16_sgpr17, implicit-def $sgpr16, implicit-def $scc, implicit-def $sgpr17 {
; CHECK-NEXT: $sgpr16_sgpr17 = S_GETPC_B64
; CHECK-NEXT: $sgpr16 = S_ADD_U32 internal $sgpr16, target-flags(amdgpu-gotprel32-lo) @bar + 4, implicit-def $scc
@@ -26,8 +29,6 @@ define fastcc i32 @foo() #0 {
; CHECK-NEXT: BUFFER_GL1_INV implicit $exec
; CHECK-NEXT: BUFFER_GL0_INV implicit $exec
; CHECK-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
- ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
- ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
; CHECK-NEXT: S_WAITCNT .Lgkmcnt_0
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $vcc_lo = S_MOV_B32 $exec_lo
diff --git a/llvm/test/CodeGen/AMDGPU/issue176578.ll b/llvm/test/CodeGen/AMDGPU/issue176578.ll
index 22c1307c779ee..35c53dfbec51b 100644
--- a/llvm/test/CodeGen/AMDGPU/issue176578.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue176578.ll
@@ -18,6 +18,8 @@ define <4 x i8> @issue176578() #0 {
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v41, s16, 15
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v41, s34, 0
; CHECK-NEXT: v_writelane_b32 v41, s35, 1
; CHECK-NEXT: v_writelane_b32 v41, s36, 2
@@ -31,8 +33,8 @@ define <4 x i8> @issue176578() #0 {
; CHECK-NEXT: v_writelane_b32 v41, s52, 10
; CHECK-NEXT: v_writelane_b32 v41, s53, 11
; CHECK-NEXT: v_writelane_b32 v41, s54, 12
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v41, s30, 13
+; CHECK-NEXT: v_writelane_b32 v41, s31, 14
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
@@ -43,8 +45,6 @@ define <4 x i8> @issue176578() #0 {
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_mov_b32 s54, 0
-; CHECK-NEXT: s_addk_i32 s32, 0x400
-; CHECK-NEXT: v_writelane_b32 v41, s31, 14
; CHECK-NEXT: s_branch .LBB0_2
; CHECK-NEXT: .LBB0_1: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 3a4684293ccc0..20f6f5021d5ef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -881,6 +881,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GISEL-NEXT: scratch_store_dword off, v24, s32 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GISEL-NEXT: v_writelane_b32 v24, s30, 0
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_writelane_b32 v24, s31, 1
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
@@ -889,13 +892,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_writelane_b32 v24, s30, 0
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_writelane_b32 v24, s31, 1
; GISEL-NEXT: v_readfirstlane_b32 s30, v0
; GISEL-NEXT: v_readfirstlane_b32 s31, v1
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 0a36d3dd28f06..f9332d752482d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -2334,20 +2334,20 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; GISEL-NEXT: v_writelane_b32 v34, s37, 1
; GISEL-NEXT: v_writelane_b32 v34, s38, 2
; GISEL-NEXT: v_writelane_b32 v34, s39, 3
+; GISEL-NEXT: v_writelane_b32 v34, s48, 4
+; GISEL-NEXT: v_writelane_b32 v34, s49, 5
+; GISEL-NEXT: v_writelane_b32 v34, s50, 6
+; GISEL-NEXT: v_writelane_b32 v34, s51, 7
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_writelane_b32 v34, s48, 4
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[12:13]
-; GISEL-NEXT: v_writelane_b32 v34, s49, 5
-; GISEL-NEXT: v_writelane_b32 v34, s50, 6
; GISEL-NEXT: s_mov_b32 s36, s28
; GISEL-NEXT: s_mov_b32 s37, s29
-; GISEL-NEXT: v_writelane_b32 v34, s51, 7
; GISEL-NEXT: v_mov_b32_e32 v16, v14
; GISEL-NEXT: v_mov_b32_e32 v17, v15
; GISEL-NEXT: v_readfirstlane_b32 s38, v0
@@ -2750,20 +2750,20 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; GISEL-NEXT: v_writelane_b32 v34, s37, 1
; GISEL-NEXT: v_writelane_b32 v34, s38, 2
; GISEL-NEXT: v_writelane_b32 v34, s39, 3
+; GISEL-NEXT: v_writelane_b32 v34, s48, 4
+; GISEL-NEXT: v_writelane_b32 v34, s49, 5
+; GISEL-NEXT: v_writelane_b32 v34, s50, 6
+; GISEL-NEXT: v_writelane_b32 v34, s51, 7
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
-; GISEL-NEXT: v_writelane_b32 v34, s48, 4
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
-; GISEL-NEXT: v_writelane_b32 v34, s49, 5
-; GISEL-NEXT: v_writelane_b32 v34, s50, 6
; GISEL-NEXT: s_mov_b32 s36, s20
; GISEL-NEXT: s_mov_b32 s37, s21
-; GISEL-NEXT: v_writelane_b32 v34, s51, 7
; GISEL-NEXT: v_mov_b32_e32 v18, v0
; GISEL-NEXT: v_mov_b32_e32 v19, v1
; GISEL-NEXT: v_mov_b32_e32 v20, v2
@@ -4582,6 +4582,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
-attributes #1 = { "amdgpu-flat-work-group-size"="128,128" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" nounwind }
+attributes #1 = { "amdgpu-flat-work-group-size"="128,128" nounwind }
attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 79e9683737849..df20d43ee49fd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -global-isel -new-reg-bank-select -global-isel-abort=2 < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
-define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) {
+define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_i1:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27,7 +27,7 @@ define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) {
ret void
}
-define void @test_readfirstlane_i1_inreg(ptr addrspace(1) %out, i1 inreg %src) {
+define void @test_readfirstlane_i1_inreg(ptr addrspace(1) %out, i1 inreg %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_i1_inreg:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -50,7 +50,7 @@ define void @test_readfirstlane_i1_inreg(ptr addrspace(1) %out, i1 inreg %src) {
ret void
}
-define void @test_readfirstlane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %src1) {
+define void @test_readfirstlane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %src1) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_i1_select:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -83,7 +83,7 @@ define void @test_readfirstlane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %
ret void
}
-define void @test_readfirstlane_i1_load(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define void @test_readfirstlane_i1_load(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_i1_load:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -113,7 +113,7 @@ define void @test_readfirstlane_i1_load(ptr addrspace(1) %out, ptr addrspace(1)
ret void
}
-define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) {
+define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -136,7 +136,7 @@ define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) {
ret void
}
-define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) {
+define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -163,7 +163,7 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) {
ret void
}
-define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) {
+define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v2i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -192,7 +192,7 @@ define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) {
ret void
}
-define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) {
+define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v3i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -225,7 +225,7 @@ define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) {
ret void
}
-define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) {
+define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v4i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -262,7 +262,7 @@ define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) {
ret void
}
-define void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) {
+define void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v8i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -315,7 +315,7 @@ define void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) {
ret void
}
-define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) {
+define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -342,7 +342,7 @@ define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) {
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_mov_b32 s0, 32
@@ -363,7 +363,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_mov_b64 s[0:1], 32
@@ -384,7 +384,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_mov_b32 s0, 0
@@ -407,7 +407,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -438,7 +438,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -472,7 +472,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -507,7 +507,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_m0:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -545,7 +545,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -583,7 +583,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -623,7 +623,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -663,7 +663,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_fi:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s17
@@ -690,7 +690,7 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) {
ret void
}
-define void @test_readfirstlane_half(ptr addrspace(1) %out, half %src) {
+define void @test_readfirstlane_half(ptr addrspace(1) %out, half %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_half:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -713,7 +713,7 @@ define void @test_readfirstlane_half(ptr addrspace(1) %out, half %src) {
ret void
}
-define void @test_readfirstlane_float(ptr addrspace(1) %out, float %src) {
+define void @test_readfirstlane_float(ptr addrspace(1) %out, float %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_float:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -736,7 +736,7 @@ define void @test_readfirstlane_float(ptr addrspace(1) %out, float %src) {
ret void
}
-define void @test_readfirstlane_bfloat(ptr addrspace(1) %out, bfloat %src) {
+define void @test_readfirstlane_bfloat(ptr addrspace(1) %out, bfloat %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_bfloat:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -759,7 +759,7 @@ define void @test_readfirstlane_bfloat(ptr addrspace(1) %out, bfloat %src) {
ret void
}
-define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) {
+define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_i16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -783,7 +783,7 @@ define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) {
ret void
}
-define void @test_readfirstlane_v2f16(ptr addrspace(1) %out, <2 x half> %src) {
+define void @test_readfirstlane_v2f16(ptr addrspace(1) %out, <2 x half> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v2f16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -806,7 +806,7 @@ define void @test_readfirstlane_v2f16(ptr addrspace(1) %out, <2 x half> %src) {
ret void
}
-define void @test_readfirstlane_v2f32(ptr addrspace(1) %out, <2 x float> %src) {
+define void @test_readfirstlane_v2f32(ptr addrspace(1) %out, <2 x float> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v2f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -831,7 +831,7 @@ define void @test_readfirstlane_v2f32(ptr addrspace(1) %out, <2 x float> %src) {
ret void
}
-define void @test_readfirstlane_v3f32(ptr addrspace(1) %out, <3 x float> %src) {
+define void @test_readfirstlane_v3f32(ptr addrspace(1) %out, <3 x float> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v3f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -858,7 +858,7 @@ define void @test_readfirstlane_v3f32(ptr addrspace(1) %out, <3 x float> %src) {
ret void
}
-define void @test_readfirstlane_v4f32(ptr addrspace(1) %out, <4 x float> %src) {
+define void @test_readfirstlane_v4f32(ptr addrspace(1) %out, <4 x float> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v4f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -887,7 +887,7 @@ define void @test_readfirstlane_v4f32(ptr addrspace(1) %out, <4 x float> %src) {
ret void
}
-define void @test_readfirstlane_v8f32(ptr addrspace(1) %out, <8 x float> %src) {
+define void @test_readfirstlane_v8f32(ptr addrspace(1) %out, <8 x float> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v8f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -924,7 +924,7 @@ define void @test_readfirstlane_v8f32(ptr addrspace(1) %out, <8 x float> %src) {
ret void
}
-define void @test_readfirstlane_v16f32(ptr addrspace(1) %out, <16 x float> %src) {
+define void @test_readfirstlane_v16f32(ptr addrspace(1) %out, <16 x float> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v16f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -977,17 +977,13 @@ define void @test_readfirstlane_v16f32(ptr addrspace(1) %out, <16 x float> %src)
ret void
}
-define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) {
+define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v32f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27
-; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
-; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2
@@ -1004,6 +1000,10 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 13
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 14
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 15
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27
+; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
+; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32
; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30
; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21
; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20
@@ -1070,10 +1070,6 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
; CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2
-; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3
@@ -1089,6 +1085,10 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 13
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 14
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 15
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2
+; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3
; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4
; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5
@@ -1152,7 +1152,7 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
ret void
}
-define void @test_readfirstlane_v2i32(ptr addrspace(1) %out, <2 x i32> %src) {
+define void @test_readfirstlane_v2i32(ptr addrspace(1) %out, <2 x i32> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v2i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1177,7 +1177,7 @@ define void @test_readfirstlane_v2i32(ptr addrspace(1) %out, <2 x i32> %src) {
ret void
}
-define void @test_readfirstlane_v3i32(ptr addrspace(1) %out, <3 x i32> %src) {
+define void @test_readfirstlane_v3i32(ptr addrspace(1) %out, <3 x i32> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v3i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1204,7 +1204,7 @@ define void @test_readfirstlane_v3i32(ptr addrspace(1) %out, <3 x i32> %src) {
ret void
}
-define void @test_readfirstlane_v4i32(ptr addrspace(1) %out, <4 x i32> %src) {
+define void @test_readfirstlane_v4i32(ptr addrspace(1) %out, <4 x i32> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v4i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1233,7 +1233,7 @@ define void @test_readfirstlane_v4i32(ptr addrspace(1) %out, <4 x i32> %src) {
ret void
}
-define void @test_readfirstlane_v5i32(ptr addrspace(1) %out, <5 x i32> %src) {
+define void @test_readfirstlane_v5i32(ptr addrspace(1) %out, <5 x i32> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v5i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1264,7 +1264,7 @@ define void @test_readfirstlane_v5i32(ptr addrspace(1) %out, <5 x i32> %src) {
ret void
}
-define void @test_readfirstlane_v6i32(ptr addrspace(1) %out, <6 x i32> %src) {
+define void @test_readfirstlane_v6i32(ptr addrspace(1) %out, <6 x i32> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v6i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1297,7 +1297,7 @@ define void @test_readfirstlane_v6i32(ptr addrspace(1) %out, <6 x i32> %src) {
ret void
}
-define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
+define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v7i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1332,7 +1332,7 @@ define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
ret void
}
-define void @test_readfirstlane_v8i32(ptr addrspace(1) %out, <8 x i32> %src) {
+define void @test_readfirstlane_v8i32(ptr addrspace(1) %out, <8 x i32> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v8i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1369,7 +1369,7 @@ define void @test_readfirstlane_v8i32(ptr addrspace(1) %out, <8 x i32> %src) {
ret void
}
-define void @test_readfirstlane_v16i32(ptr addrspace(1) %out, <16 x i32> %src) {
+define void @test_readfirstlane_v16i32(ptr addrspace(1) %out, <16 x i32> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v16i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1422,17 +1422,13 @@ define void @test_readfirstlane_v16i32(ptr addrspace(1) %out, <16 x i32> %src) {
ret void
}
-define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
+define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v32i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27
-; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
-; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2
@@ -1449,6 +1445,10 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 13
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 14
; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 15
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27
+; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
+; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32
; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30
; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21
; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20
@@ -1515,10 +1515,6 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
; CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2
-; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3
@@ -1534,6 +1530,10 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 13
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 14
; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 15
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2
+; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3
; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4
; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5
@@ -1597,7 +1597,7 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
ret void
}
-define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
+define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v8i16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1626,7 +1626,7 @@ define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
ret void
}
-define void @test_readfirstlane_v16i16(ptr addrspace(1) %out, <16 x i16> %src) {
+define void @test_readfirstlane_v16i16(ptr addrspace(1) %out, <16 x i16> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v16i16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1663,7 +1663,7 @@ define void @test_readfirstlane_v16i16(ptr addrspace(1) %out, <16 x i16> %src) {
ret void
}
-define void @test_readfirstlane_v32i16(ptr addrspace(1) %out, <32 x i16> %src) {
+define void @test_readfirstlane_v32i16(ptr addrspace(1) %out, <32 x i16> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v32i16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1717,7 +1717,7 @@ define void @test_readfirstlane_v32i16(ptr addrspace(1) %out, <32 x i16> %src) {
}
-define void @test_readfirstlane_v32f16(ptr addrspace(1) %out, <32 x half> %src) {
+define void @test_readfirstlane_v32f16(ptr addrspace(1) %out, <32 x half> %src) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_v32f16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1770,3 +1770,4 @@ define void @test_readfirstlane_v32f16(ptr addrspace(1) %out, <32 x half> %src)
ret void
}
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 6132baceb1ce3..6b287385414e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -120,7 +120,7 @@ bb:
ret void
}
-define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x64_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -146,7 +146,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -172,7 +172,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -198,7 +198,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -423,7 +423,7 @@ bb:
ret void
}
-define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -508,7 +508,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -593,7 +593,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -678,7 +678,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -826,14 +826,14 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
+; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
-; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
@@ -943,7 +943,7 @@ bb:
ret void
}
-define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -969,7 +969,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -995,7 +995,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1021,7 +1021,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1150,7 +1150,7 @@ bb:
ret void
}
-define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1200,7 +1200,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1250,7 +1250,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1300,7 +1300,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1527,7 +1527,7 @@ bb:
ret void
}
-define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
+define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_i32_16x16x128_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1553,7 +1553,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1,
ret <4 x i32> %result
}
-define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
+define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1579,7 +1579,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32
ret <4 x i32> %result
}
-define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
+define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1605,7 +1605,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32
ret <4 x i32> %result
}
-define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x i32> inreg %arg2, i32 inreg %arg3) {
+define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x i32> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1842,7 +1842,7 @@ bb:
ret void
}
-define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
+define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1927,7 +1927,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
ret <16 x i32> %result
}
-define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
+define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2012,7 +2012,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
ret <16 x i32> %result
}
-define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
+define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2097,7 +2097,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
ret <16 x i32> %result
}
-define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) {
+define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2245,14 +2245,14 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
+; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
-; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
@@ -2426,7 +2426,7 @@ bb:
ret void
}
-define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2452,7 +2452,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32>
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2478,7 +2478,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2504,7 +2504,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2717,7 +2717,7 @@ bb:
ret void
}
-define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2743,7 +2743,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32>
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2769,7 +2769,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2795,7 +2795,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3008,7 +3008,7 @@ bb:
ret void
}
-define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3034,7 +3034,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32>
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3060,7 +3060,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3086,7 +3086,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3299,7 +3299,7 @@ bb:
ret void
}
-define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3325,7 +3325,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32>
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3351,7 +3351,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) #1 {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3377,7 +3377,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <
ret <4 x float> %result
}
-define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
+define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3614,7 +3614,7 @@ bb:
ret void
}
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3699,7 +3699,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3784,7 +3784,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3869,7 +3869,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4017,14 +4017,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
+; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
-; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
@@ -4222,7 +4222,7 @@ bb:
ret void
}
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4307,7 +4307,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4392,7 +4392,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4477,7 +4477,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4625,14 +4625,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
+; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
-; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
@@ -4830,7 +4830,7 @@ bb:
ret void
}
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4915,7 +4915,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5000,7 +5000,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5085,7 +5085,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5233,14 +5233,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
+; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
-; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
@@ -5438,7 +5438,7 @@ bb:
ret void
}
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5523,7 +5523,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5608,7 +5608,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5693,7 +5693,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
ret <16 x float> %result
}
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) #1 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5841,14 +5841,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
-; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
+; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
-; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
@@ -5895,4 +5895,5 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
ret <16 x float> %result
}
-attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" }
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" nounwind }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index a98f22fdf72f2..a0e2a8d017980 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -8,7 +8,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1170 < %s | FileCheck -check-prefixes=GFX1170 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
-define double @v_maximum_f64(double %src0, double %src1) {
+define double @v_maximum_f64(double %src0, double %src1) #0 {
; GFX7-LABEL: v_maximum_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -88,7 +88,7 @@ define double @v_maximum_f64(double %src0, double %src1) {
ret double %op
}
-define double @v_maximum_f64__nnan(double %src0, double %src1) {
+define double @v_maximum_f64__nnan(double %src0, double %src1) #0 {
; GFX7-LABEL: v_maximum_f64__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -138,7 +138,7 @@ define double @v_maximum_f64__nnan(double %src0, double %src1) {
ret double %op
}
-define double @v_maximum_f64__nsz(double %src0, double %src1) {
+define double @v_maximum_f64__nsz(double %src0, double %src1) #0 {
; GFX7-LABEL: v_maximum_f64__nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -218,7 +218,7 @@ define double @v_maximum_f64__nsz(double %src0, double %src1) {
ret double %op
}
-define double @v_maximum_f64__nnan_nsz(double %src0, double %src1) {
+define double @v_maximum_f64__nnan_nsz(double %src0, double %src1) #0 {
; GFX7-LABEL: v_maximum_f64__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -268,7 +268,7 @@ define double @v_maximum_f64__nnan_nsz(double %src0, double %src1) {
ret double %op
}
-define double @v_maximum_f64__nnan_src0(double %arg0, double %src1) {
+define double @v_maximum_f64__nnan_src0(double %arg0, double %src1) #0 {
; GFX7-LABEL: v_maximum_f64__nnan_src0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -360,7 +360,7 @@ define double @v_maximum_f64__nnan_src0(double %arg0, double %src1) {
ret double %op
}
-define double @v_maximum_f64__nnan_src1(double %src0, double %arg1) {
+define double @v_maximum_f64__nnan_src1(double %src0, double %arg1) #0 {
; GFX7-LABEL: v_maximum_f64__nnan_src1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -452,7 +452,7 @@ define double @v_maximum_f64__nnan_src1(double %src0, double %arg1) {
ret double %op
}
-define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
+define void @s_maximum_f64(double inreg %src0, double inreg %src1) #0 {
; GFX7-LABEL: s_maximum_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -564,7 +564,7 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
ret void
}
-define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) {
+define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v2f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -671,7 +671,7 @@ define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) {
ret <2 x double> %op
}
-define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src1) {
+define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v2f64__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -728,7 +728,7 @@ define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
ret <2 x double> %op
}
-define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1) {
+define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v2f64__nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -835,7 +835,7 @@ define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
ret <2 x double> %op
}
-define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> %src1) {
+define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v2f64__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -892,7 +892,7 @@ define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
ret <2 x double> %op
}
-define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) {
+define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) #0 {
; GFX7-LABEL: s_maximum_v2f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1037,7 +1037,7 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
ret void
}
-define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) {
+define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v3f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1170,7 +1170,7 @@ define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) {
ret <3 x double> %op
}
-define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src1) {
+define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v3f64__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1234,7 +1234,7 @@ define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
ret <3 x double> %op
}
-define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1) {
+define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v3f64__nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1367,7 +1367,7 @@ define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
ret <3 x double> %op
}
-define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> %src1) {
+define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v3f64__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1431,7 +1431,7 @@ define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
ret <3 x double> %op
}
-define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) {
+define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v4f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1591,7 +1591,7 @@ define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) {
ret <4 x double> %op
}
-define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src1) {
+define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v4f64__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1662,7 +1662,7 @@ define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
ret <4 x double> %op
}
-define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1) {
+define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v4f64__nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1822,7 +1822,7 @@ define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
ret <4 x double> %op
}
-define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> %src1) {
+define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v4f64__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1893,7 +1893,7 @@ define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
ret <4 x double> %op
}
-define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) {
+define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v8f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2174,7 +2174,7 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) {
ret <8 x double> %op
}
-define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) {
+define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) #0 {
; GFX7-LABEL: v_maximum_v16f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2545,6 +2545,12 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse
; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:8
; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:4
; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:16
@@ -2572,12 +2578,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: scratch_load_dword v31, off, s32
; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104
; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse
; GFX950-NEXT: s_waitcnt vmcnt(25)
; GFX950-NEXT: v_max_f64 v[58:59], v[0:1], v[34:35]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[34:35]
@@ -3077,3 +3077,5 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
%op = call <16 x double> @llvm.maximum.v16f64(<16 x double> %src0, <16 x double> %src1)
ret <16 x double> %op
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index 1d7678779b8be..1d2392f74847d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -8,7 +8,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1170 < %s | FileCheck -check-prefixes=GFX1170 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
-define double @v_minimum_f64(double %src0, double %src1) {
+define double @v_minimum_f64(double %src0, double %src1) #0 {
; GFX7-LABEL: v_minimum_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -88,7 +88,7 @@ define double @v_minimum_f64(double %src0, double %src1) {
ret double %op
}
-define double @v_minimum_f64__nnan(double %src0, double %src1) {
+define double @v_minimum_f64__nnan(double %src0, double %src1) #0 {
; GFX7-LABEL: v_minimum_f64__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -138,7 +138,7 @@ define double @v_minimum_f64__nnan(double %src0, double %src1) {
ret double %op
}
-define double @v_minimum_f64__nsz(double %src0, double %src1) {
+define double @v_minimum_f64__nsz(double %src0, double %src1) #0 {
; GFX7-LABEL: v_minimum_f64__nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -218,7 +218,7 @@ define double @v_minimum_f64__nsz(double %src0, double %src1) {
ret double %op
}
-define double @v_minimum_f64__nnan_nsz(double %src0, double %src1) {
+define double @v_minimum_f64__nnan_nsz(double %src0, double %src1) #0 {
; GFX7-LABEL: v_minimum_f64__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -268,7 +268,7 @@ define double @v_minimum_f64__nnan_nsz(double %src0, double %src1) {
ret double %op
}
-define double @v_minimum_f64__nnan_src0(double %arg0, double %src1) {
+define double @v_minimum_f64__nnan_src0(double %arg0, double %src1) #0 {
; GFX7-LABEL: v_minimum_f64__nnan_src0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -360,7 +360,7 @@ define double @v_minimum_f64__nnan_src0(double %arg0, double %src1) {
ret double %op
}
-define double @v_minimum_f64__nnan_src1(double %src0, double %arg1) {
+define double @v_minimum_f64__nnan_src1(double %src0, double %arg1) #0 {
; GFX7-LABEL: v_minimum_f64__nnan_src1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -452,7 +452,7 @@ define double @v_minimum_f64__nnan_src1(double %src0, double %arg1) {
ret double %op
}
-define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
+define void @s_minimum_f64(double inreg %src0, double inreg %src1) #0 {
; GFX7-LABEL: s_minimum_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -564,7 +564,7 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
ret void
}
-define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) {
+define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v2f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -671,7 +671,7 @@ define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) {
ret <2 x double> %op
}
-define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src1) {
+define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v2f64__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -728,7 +728,7 @@ define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
ret <2 x double> %op
}
-define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1) {
+define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v2f64__nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -835,7 +835,7 @@ define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
ret <2 x double> %op
}
-define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> %src1) {
+define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v2f64__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -892,7 +892,7 @@ define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
ret <2 x double> %op
}
-define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) {
+define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) #0 {
; GFX7-LABEL: s_minimum_v2f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1037,7 +1037,7 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
ret void
}
-define <3 x double> @v_minimum_v3f64(<3 x double> %src0, <3 x double> %src1) {
+define <3 x double> @v_minimum_v3f64(<3 x double> %src0, <3 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v3f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1170,7 +1170,7 @@ define <3 x double> @v_minimum_v3f64(<3 x double> %src0, <3 x double> %src1) {
ret <3 x double> %op
}
-define <3 x double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src1) {
+define <3 x double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v3f64__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1234,7 +1234,7 @@ define <3 x double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
ret <3 x double> %op
}
-define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1) {
+define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v3f64__nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1367,7 +1367,7 @@ define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
ret <3 x double> %op
}
-define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> %src1) {
+define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v3f64__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1431,7 +1431,7 @@ define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
ret <3 x double> %op
}
-define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) {
+define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v4f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1591,7 +1591,7 @@ define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) {
ret <4 x double> %op
}
-define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src1) {
+define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v4f64__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1662,7 +1662,7 @@ define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
ret <4 x double> %op
}
-define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1) {
+define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v4f64__nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1822,7 +1822,7 @@ define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
ret <4 x double> %op
}
-define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> %src1) {
+define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v4f64__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1893,7 +1893,7 @@ define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
ret <4 x double> %op
}
-define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) {
+define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v8f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2174,7 +2174,7 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) {
ret <8 x double> %op
}
-define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) {
+define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) #0 {
; GFX7-LABEL: v_minimum_v16f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2545,6 +2545,12 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse
; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:8
; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:4
; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:16
@@ -2572,12 +2578,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX950-NEXT: scratch_load_dword v31, off, s32
; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104
; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse
; GFX950-NEXT: s_waitcnt vmcnt(25)
; GFX950-NEXT: v_min_f64 v[58:59], v[0:1], v[34:35]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[34:35]
@@ -3077,3 +3077,5 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
%op = call <16 x double> @llvm.minimum.v16f64(<16 x double> %src0, <16 x double> %src1)
ret <16 x double> %op
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
index 7c5e406bd07cc..31dc0dc9e66e2 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
@@ -17,8 +17,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: ;;#ASMSTART
@@ -46,8 +46,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: ;;#ASMSTART
@@ -74,8 +74,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 64
; GFX11-NEXT: v_writelane_b32 v1, s55, 0
+; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: s_addc_u32 s0, s32, 0x4040
@@ -108,9 +109,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: v_writelane_b32 v1, s55, 0
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: s_add_co_ci_u32 s0, s32, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -139,9 +139,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v1, s55, 0
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
-; GFX8-NEXT: v_writelane_b32 v1, s55, 0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
@@ -168,6 +168,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_writelane_b32 v1, s55, 0
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: ;;#ASMSTART
@@ -175,7 +176,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
-; GFX900-NEXT: v_writelane_b32 v1, s55, 0
; GFX900-NEXT: v_readfirstlane_b32 s55, v0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
; GFX900-NEXT: ;;#ASMSTART
@@ -196,13 +196,13 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX942-NEXT: s_add_i32 s2, s32, 0x4044
; GFX942-NEXT: scratch_store_dword off, v1, s2 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
+; GFX942-NEXT: v_writelane_b32 v1, s55, 0
; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: s_and_b64 s[0:1], 0, exec
; GFX942-NEXT: s_addc_u32 s0, s32, 0x4040
; GFX942-NEXT: s_bitcmp1_b32 s0, 0
; GFX942-NEXT: s_bitset0_b32 s0, 0
-; GFX942-NEXT: v_writelane_b32 v1, s55, 0
; GFX942-NEXT: s_mov_b32 s55, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
@@ -397,10 +397,10 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
; GFX942-NEXT: s_add_i32 s2, s32, 0x4044
; GFX942-NEXT: scratch_store_dword off, v1, s2 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
+; GFX942-NEXT: v_writelane_b32 v1, s55, 0
; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: s_add_i32 s0, s32, 0x4040
-; GFX942-NEXT: v_writelane_b32 v1, s55, 0
; GFX942-NEXT: s_mov_b32 s55, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
@@ -433,9 +433,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: s_mov_b32 s32, s33
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
@@ -467,9 +467,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880
; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: s_mov_b32 s32, s33
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
@@ -501,8 +501,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_addk_i32 s32, 0x4080
-; GFX11-NEXT: s_add_i32 s0, s33, 64
; GFX11-NEXT: v_writelane_b32 v1, s55, 0
+; GFX11-NEXT: s_add_i32 s0, s33, 64
+; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: s_addc_u32 s0, s33, 0x4040
@@ -511,7 +512,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
; GFX11-NEXT: s_bitset0_b32 s0, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s55, s0
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s55, scc
@@ -539,8 +540,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
-; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: v_writelane_b32 v1, s55, 0
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: s_add_co_ci_u32 s0, s33, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s33
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -574,16 +575,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX8-NEXT: s_add_i32 s7, s33, 0x101100
; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_add_i32 s32, s32, 0x102000
+; GFX8-NEXT: v_writelane_b32 v1, s55, 0
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
-; GFX8-NEXT: v_writelane_b32 v1, s55, 0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33
; GFX8-NEXT: s_movk_i32 s55, 0x4040
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s55, v0
-; GFX8-NEXT: s_add_i32 s32, s32, 0x102000
; GFX8-NEXT: v_readfirstlane_b32 s55, v0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
; GFX8-NEXT: ;;#ASMSTART
@@ -608,6 +609,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX900-NEXT: s_add_i32 s7, s33, 0x101100
; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_add_i32 s32, s32, 0x102000
+; GFX900-NEXT: v_writelane_b32 v1, s55, 0
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: ;;#ASMSTART
@@ -615,8 +618,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33
; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
-; GFX900-NEXT: s_add_i32 s32, s32, 0x102000
-; GFX900-NEXT: v_writelane_b32 v1, s55, 0
; GFX900-NEXT: v_readfirstlane_b32 s55, v0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
; GFX900-NEXT: ;;#ASMSTART
@@ -642,13 +643,13 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX942-NEXT: scratch_store_dword off, v1, s3 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: s_addk_i32 s32, 0x4080
+; GFX942-NEXT: v_writelane_b32 v1, s55, 0
; GFX942-NEXT: s_add_i32 s0, s33, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: s_and_b64 s[0:1], 0, exec
; GFX942-NEXT: s_addc_u32 s0, s33, 0x4040
; GFX942-NEXT: s_bitcmp1_b32 s0, 0
; GFX942-NEXT: s_bitset0_b32 s0, 0
-; GFX942-NEXT: v_writelane_b32 v1, s55, 0
; GFX942-NEXT: s_mov_b32 s55, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
@@ -681,8 +682,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32
; GFX10_1-NEXT: v_writelane_b32 v0, s55, 0
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
; GFX10_1-NEXT: v_readfirstlane_b32 s55, v1
@@ -705,8 +706,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800
; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32
; GFX10_3-NEXT: v_writelane_b32 v0, s55, 0
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
; GFX10_3-NEXT: v_readfirstlane_b32 s55, v1
@@ -728,13 +729,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
; GFX11-NEXT: scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: v_writelane_b32 v0, s55, 0
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: s_addc_u32 s0, s32, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
; GFX11-NEXT: s_bitset0_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s55, s0
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s55, scc
@@ -804,9 +804,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
; GFX900-NEXT: s_add_i32 s6, s32, 0x101000
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_writelane_b32 v0, s55, 0
; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s32
; GFX900-NEXT: v_add_u32_e32 v1, 64, v1
-; GFX900-NEXT: v_writelane_b32 v0, s55, 0
; GFX900-NEXT: v_readfirstlane_b32 s55, v1
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
; GFX900-NEXT: ;;#ASMSTART
@@ -827,11 +827,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
; GFX942-NEXT: s_add_i32 s2, s32, 0x4040
; GFX942-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
+; GFX942-NEXT: v_writelane_b32 v0, s55, 0
; GFX942-NEXT: s_and_b64 s[0:1], 0, exec
; GFX942-NEXT: s_addc_u32 s0, s32, 64
; GFX942-NEXT: s_bitcmp1_b32 s0, 0
; GFX942-NEXT: s_bitset0_b32 s0, 0
-; GFX942-NEXT: v_writelane_b32 v0, s55, 0
; GFX942-NEXT: s_mov_b32 s55, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s55, scc
@@ -989,8 +989,8 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0
; GFX942-NEXT: s_add_i32 s2, s32, 0x4040
; GFX942-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_writelane_b32 v0, s55, 0
+; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: s_mov_b32 s55, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s55
@@ -1018,9 +1018,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s33
-; GFX10_1-NEXT: v_writelane_b32 v0, s55, 0
; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT: v_writelane_b32 v0, s55, 0
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s33
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: s_mov_b32 s32, s33
; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
@@ -1047,9 +1047,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800
; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s33
-; GFX10_3-NEXT: v_writelane_b32 v0, s55, 0
; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT: v_writelane_b32 v0, s55, 0
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s33
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: s_mov_b32 s32, s33
; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
@@ -1076,8 +1076,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_addk_i32 s32, 0x4080
-; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: v_writelane_b32 v0, s55, 0
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: s_addc_u32 s0, s33, 64
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
@@ -1109,13 +1109,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v0, s55, 0
; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
+; GFX12-NEXT: v_writelane_b32 v0, s55, 0
; GFX12-NEXT: s_mov_b32 s55, s33
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s55, scc
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s55, v0, 0
; GFX12-NEXT: s_mov_b32 s32, s33
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
@@ -1136,11 +1137,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX8-NEXT: s_add_i32 s7, s33, 0x101000
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_add_i32 s32, s32, 0x102000
; GFX8-NEXT: v_writelane_b32 v0, s55, 0
; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s33
; GFX8-NEXT: s_mov_b32 s55, 64
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s55, v1
-; GFX8-NEXT: s_add_i32 s32, s32, 0x102000
; GFX8-NEXT: v_readfirstlane_b32 s55, v1
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
; GFX8-NEXT: ;;#ASMSTART
@@ -1165,10 +1166,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX900-NEXT: s_add_i32 s7, s33, 0x101000
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s33
-; GFX900-NEXT: v_add_u32_e32 v1, 64, v1
; GFX900-NEXT: s_add_i32 s32, s32, 0x102000
; GFX900-NEXT: v_writelane_b32 v0, s55, 0
+; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s33
+; GFX900-NEXT: v_add_u32_e32 v1, 64, v1
; GFX900-NEXT: v_readfirstlane_b32 s55, v1
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
; GFX900-NEXT: ;;#ASMSTART
@@ -1194,11 +1195,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX942-NEXT: scratch_store_dword off, v0, s3 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: s_addk_i32 s32, 0x4080
+; GFX942-NEXT: v_writelane_b32 v0, s55, 0
; GFX942-NEXT: s_and_b64 s[0:1], 0, exec
; GFX942-NEXT: s_addc_u32 s0, s33, 64
; GFX942-NEXT: s_bitcmp1_b32 s0, 0
; GFX942-NEXT: s_bitset0_b32 s0, 0
-; GFX942-NEXT: v_writelane_b32 v0, s55, 0
; GFX942-NEXT: s_mov_b32 s55, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s55, scc
@@ -1228,8 +1229,8 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10_1-NEXT: s_mov_b32 exec_lo, s5
-; GFX10_1-NEXT: v_writelane_b32 v0, s55, 0
; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT: v_writelane_b32 v0, s55, 0
; GFX10_1-NEXT: s_lshr_b32 s55, s33, 5
; GFX10_1-NEXT: s_mov_b32 s32, s33
; GFX10_1-NEXT: s_add_i32 s55, s55, 64
@@ -1255,8 +1256,8 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800
; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s5
-; GFX10_3-NEXT: v_writelane_b32 v0, s55, 0
; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT: v_writelane_b32 v0, s55, 0
; GFX10_3-NEXT: s_lshr_b32 s55, s33, 5
; GFX10_3-NEXT: s_mov_b32 s32, s33
; GFX10_3-NEXT: s_add_i32 s55, s55, 64
@@ -1281,8 +1282,8 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX11-NEXT: s_add_i32 s2, s33, 0x4040
; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v0, s55, 0
; GFX11-NEXT: s_addk_i32 s32, 0x4080
+; GFX11-NEXT: v_writelane_b32 v0, s55, 0
; GFX11-NEXT: s_add_i32 s1, s33, 64
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_mov_b32 s55, s1
@@ -1311,14 +1312,15 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_writelane_b32 v0, s55, 0
; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
+; GFX12-NEXT: v_writelane_b32 v0, s55, 0
; GFX12-NEXT: s_mov_b32 s55, s33
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s55
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_mov_b32 s32, s33
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s55, v0, 0
+; GFX12-NEXT: s_mov_b32 s32, s33
; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -1390,8 +1392,8 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX942-NEXT: scratch_store_dword off, v0, s1 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_addk_i32 s32, 0x4080
-; GFX942-NEXT: s_add_i32 s1, s33, 64
; GFX942-NEXT: v_writelane_b32 v0, s55, 0
+; GFX942-NEXT: s_add_i32 s1, s33, 64
; GFX942-NEXT: s_mov_b32 s55, s1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s55
@@ -1529,8 +1531,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_lshr_b32 s4, s32, 6
; GFX8-NEXT: v_writelane_b32 v1, s55, 0
+; GFX8-NEXT: s_lshr_b32 s4, s32, 6
; GFX8-NEXT: s_add_i32 s55, s4, 0x442c
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
@@ -1556,8 +1558,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: s_lshr_b32 s4, s32, 6
; GFX900-NEXT: v_writelane_b32 v1, s55, 0
+; GFX900-NEXT: s_lshr_b32 s4, s32, 6
; GFX900-NEXT: s_add_i32 s55, s4, 0x442c
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
@@ -1763,9 +1765,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_writelane_b32 v1, s55, 0
; GFX900-NEXT: s_lshr_b32 s4, s32, 6
; GFX900-NEXT: s_addk_i32 s4, 0x4040
-; GFX900-NEXT: v_writelane_b32 v1, s55, 0
; GFX900-NEXT: s_lshl2_add_u32 s55, s16, s4
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
@@ -1791,8 +1793,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX942-NEXT: s_add_i32 s1, s32, 0x8040
; GFX942-NEXT: scratch_store_dword off, v1, s1 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
-; GFX942-NEXT: s_add_i32 s1, s32, 0x4040
; GFX942-NEXT: v_writelane_b32 v1, s55, 0
+; GFX942-NEXT: s_add_i32 s1, s32, 0x4040
; GFX942-NEXT: s_lshl2_add_u32 s55, s0, s1
; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 68c0d78485517..edf020cce0fcc 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -52,11 +52,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX7-NEXT: v_writelane_b32 v23, s53, 12
; GFX7-NEXT: v_writelane_b32 v23, s54, 13
; GFX7-NEXT: v_writelane_b32 v23, s55, 14
-; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
; GFX7-NEXT: v_writelane_b32 v23, s30, 15
+; GFX7-NEXT: v_writelane_b32 v23, s31, 16
+; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT: v_writelane_b32 v23, s31, 16
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use alloca0 v0
; GFX7-NEXT: ;;#ASMEND
@@ -119,11 +119,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX8-NEXT: v_writelane_b32 v23, s53, 12
; GFX8-NEXT: v_writelane_b32 v23, s54, 13
; GFX8-NEXT: v_writelane_b32 v23, s55, 14
-; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX8-NEXT: v_writelane_b32 v23, s30, 15
+; GFX8-NEXT: v_writelane_b32 v23, s31, 16
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_writelane_b32 v23, s31, 16
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
@@ -187,11 +187,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX900-NEXT: v_writelane_b32 v23, s53, 12
; GFX900-NEXT: v_writelane_b32 v23, s54, 13
; GFX900-NEXT: v_writelane_b32 v23, s55, 14
-; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_writelane_b32 v23, s30, 15
+; GFX900-NEXT: v_writelane_b32 v23, s31, 16
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT: v_writelane_b32 v23, s31, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use alloca0 v0
; GFX900-NEXT: ;;#ASMEND
@@ -254,11 +254,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX942-NEXT: v_writelane_b32 v23, s53, 12
; GFX942-NEXT: v_writelane_b32 v23, s54, 13
; GFX942-NEXT: v_writelane_b32 v23, s55, 14
-; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_writelane_b32 v23, s30, 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v23, s31, 16
+; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT: v_writelane_b32 v23, s31, 16
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
; GFX942-NEXT: ;;#ASMEND
@@ -306,13 +307,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_1-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
; GFX10_1-NEXT: v_writelane_b32 v23, s33, 0
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: v_writelane_b32 v23, s34, 1
-; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
-; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use alloca0 v0
-; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_writelane_b32 v23, s35, 2
; GFX10_1-NEXT: v_writelane_b32 v23, s36, 3
; GFX10_1-NEXT: v_writelane_b32 v23, s37, 4
@@ -328,6 +323,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_1-NEXT: v_writelane_b32 v23, s55, 14
; GFX10_1-NEXT: v_writelane_b32 v23, s30, 15
; GFX10_1-NEXT: v_writelane_b32 v23, s31, 16
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use alloca0 v0
+; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX10_1-NEXT: ;;#ASMEND
@@ -371,13 +372,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_3-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
; GFX10_3-NEXT: v_writelane_b32 v23, s33, 0
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: v_writelane_b32 v23, s34, 1
-; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
-; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use alloca0 v0
-; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_writelane_b32 v23, s35, 2
; GFX10_3-NEXT: v_writelane_b32 v23, s36, 3
; GFX10_3-NEXT: v_writelane_b32 v23, s37, 4
@@ -393,6 +388,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_3-NEXT: v_writelane_b32 v23, s55, 14
; GFX10_3-NEXT: v_writelane_b32 v23, s30, 15
; GFX10_3-NEXT: v_writelane_b32 v23, s31, 16
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use alloca0 v0
+; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX10_3-NEXT: ;;#ASMEND
@@ -435,14 +436,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX11-NEXT: scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v23, s33, 0
-; GFX11-NEXT: s_add_i32 s0, s32, 64
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: v_writelane_b32 v23, s34, 1
-; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use alloca0 v0
-; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: v_writelane_b32 v23, s35, 2
; GFX11-NEXT: v_writelane_b32 v23, s36, 3
; GFX11-NEXT: v_writelane_b32 v23, s37, 4
@@ -458,14 +452,21 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX11-NEXT: v_writelane_b32 v23, s55, 14
; GFX11-NEXT: v_writelane_b32 v23, s30, 15
; GFX11-NEXT: v_writelane_b32 v23, s31, 16
+; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use alloca0 v0
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_addc_u32 s59, s32, 0x4040
; GFX11-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_bitcmp1_b32 s59, 0
; GFX11-NEXT: s_bitset0_b32 s59, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s54, s59
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
@@ -506,11 +507,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v23, s33, 0
-; GFX12-NEXT: v_mov_b32_e32 v0, s32
-; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use alloca0 v0
-; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: v_writelane_b32 v23, s34, 1
; GFX12-NEXT: v_writelane_b32 v23, s35, 2
; GFX12-NEXT: v_writelane_b32 v23, s36, 3
@@ -527,6 +523,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: v_writelane_b32 v23, s55, 14
; GFX12-NEXT: v_writelane_b32 v23, s30, 15
; GFX12-NEXT: v_writelane_b32 v23, s31, 16
+; GFX12-NEXT: v_mov_b32_e32 v0, s32
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use alloca0 v0
+; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX12-NEXT: ;;#ASMEND
@@ -629,8 +630,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX7-NEXT: v_writelane_b32 v21, s54, 13
; GFX7-NEXT: v_writelane_b32 v21, s55, 14
; GFX7-NEXT: v_writelane_b32 v21, s30, 15
-; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
; GFX7-NEXT: v_writelane_b32 v21, s31, 16
+; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX7-NEXT: ;;#ASMEND
@@ -687,8 +688,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX8-NEXT: v_writelane_b32 v21, s54, 13
; GFX8-NEXT: v_writelane_b32 v21, s55, 14
; GFX8-NEXT: v_writelane_b32 v21, s30, 15
-; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
; GFX8-NEXT: v_writelane_b32 v21, s31, 16
+; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX8-NEXT: ;;#ASMEND
@@ -746,8 +747,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX900-NEXT: v_writelane_b32 v21, s54, 13
; GFX900-NEXT: v_writelane_b32 v21, s55, 14
; GFX900-NEXT: v_writelane_b32 v21, s30, 15
-; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
; GFX900-NEXT: v_writelane_b32 v21, s31, 16
+; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX900-NEXT: ;;#ASMEND
@@ -805,9 +806,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX942-NEXT: v_writelane_b32 v21, s54, 13
; GFX942-NEXT: v_writelane_b32 v21, s55, 14
; GFX942-NEXT: v_writelane_b32 v21, s30, 15
-; GFX942-NEXT: s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v21, s31, 16
+; GFX942-NEXT: s_and_b64 s[60:61], 0, exec
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX942-NEXT: ;;#ASMEND
@@ -852,7 +853,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_1-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
; GFX10_1-NEXT: v_writelane_b32 v21, s33, 0
-; GFX10_1-NEXT: s_and_b32 s59, 0, exec_lo
; GFX10_1-NEXT: v_writelane_b32 v21, s34, 1
; GFX10_1-NEXT: v_writelane_b32 v21, s35, 2
; GFX10_1-NEXT: v_writelane_b32 v21, s36, 3
@@ -874,6 +874,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_lshrrev_b32_e64 v22, 5, s32
; GFX10_1-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
+; GFX10_1-NEXT: s_and_b32 s59, 0, exec_lo
; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 16, v22
; GFX10_1-NEXT: v_readfirstlane_b32 s54, v22
; GFX10_1-NEXT: ;;#ASMSTART
@@ -912,7 +913,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_3-NEXT: buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
; GFX10_3-NEXT: v_writelane_b32 v21, s33, 0
-; GFX10_3-NEXT: s_and_b32 s59, 0, exec_lo
; GFX10_3-NEXT: v_writelane_b32 v21, s34, 1
; GFX10_3-NEXT: v_writelane_b32 v21, s35, 2
; GFX10_3-NEXT: v_writelane_b32 v21, s36, 3
@@ -934,6 +934,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_lshrrev_b32_e64 v22, 5, s32
; GFX10_3-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
+; GFX10_3-NEXT: s_and_b32 s59, 0, exec_lo
; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 16, v22
; GFX10_3-NEXT: v_readfirstlane_b32 s54, v22
; GFX10_3-NEXT: ;;#ASMSTART
@@ -971,7 +972,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX11-NEXT: scratch_store_b32 off, v21, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v21, s33, 0
-; GFX11-NEXT: s_and_b32 s59, 0, exec_lo
; GFX11-NEXT: v_writelane_b32 v21, s34, 1
; GFX11-NEXT: v_writelane_b32 v21, s35, 2
; GFX11-NEXT: v_writelane_b32 v21, s36, 3
@@ -988,6 +988,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX11-NEXT: v_writelane_b32 v21, s55, 14
; GFX11-NEXT: v_writelane_b32 v21, s30, 15
; GFX11-NEXT: v_writelane_b32 v21, s31, 16
+; GFX11-NEXT: s_and_b32 s59, 0, exec_lo
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX11-NEXT: ;;#ASMEND
@@ -1036,7 +1037,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v21, s33, 0
-; GFX12-NEXT: s_and_b32 s59, 0, exec_lo
; GFX12-NEXT: v_writelane_b32 v21, s34, 1
; GFX12-NEXT: v_writelane_b32 v21, s35, 2
; GFX12-NEXT: v_writelane_b32 v21, s36, 3
@@ -1057,11 +1057,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
+; GFX12-NEXT: s_and_b32 s59, 0, exec_lo
; GFX12-NEXT: s_mov_b32 s54, s32
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s30, v21, 15
; GFX12-NEXT: v_readlane_b32 s31, v21, 16
; GFX12-NEXT: v_readlane_b32 s55, v21, 14
@@ -1150,16 +1150,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX7-NEXT: v_writelane_b32 v23, s52, 11
; GFX7-NEXT: v_writelane_b32 v23, s53, 12
; GFX7-NEXT: v_writelane_b32 v23, s54, 13
-; GFX7-NEXT: s_lshr_b32 s5, s32, 6
; GFX7-NEXT: v_writelane_b32 v23, s55, 14
+; GFX7-NEXT: v_writelane_b32 v23, s30, 15
+; GFX7-NEXT: v_writelane_b32 v23, s31, 16
+; GFX7-NEXT: s_lshr_b32 s5, s32, 6
; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
; GFX7-NEXT: s_add_i32 s4, s5, 0x4240
; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX7-NEXT: v_writelane_b32 v23, s30, 15
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0
; GFX7-NEXT: v_writelane_b32 v22, s4, 0
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT: v_writelane_b32 v23, s31, 16
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use alloca0 v0
; GFX7-NEXT: ;;#ASMEND
@@ -1221,16 +1221,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX8-NEXT: v_writelane_b32 v23, s52, 11
; GFX8-NEXT: v_writelane_b32 v23, s53, 12
; GFX8-NEXT: v_writelane_b32 v23, s54, 13
-; GFX8-NEXT: s_lshr_b32 s5, s32, 6
; GFX8-NEXT: v_writelane_b32 v23, s55, 14
+; GFX8-NEXT: v_writelane_b32 v23, s30, 15
+; GFX8-NEXT: v_writelane_b32 v23, s31, 16
+; GFX8-NEXT: s_lshr_b32 s5, s32, 6
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX8-NEXT: s_add_i32 s4, s5, 0x4240
; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX8-NEXT: v_writelane_b32 v23, s30, 15
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
; GFX8-NEXT: v_writelane_b32 v22, s4, 0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_writelane_b32 v23, s31, 16
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
@@ -1291,16 +1291,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX900-NEXT: v_writelane_b32 v23, s52, 11
; GFX900-NEXT: v_writelane_b32 v23, s53, 12
; GFX900-NEXT: v_writelane_b32 v23, s54, 13
-; GFX900-NEXT: s_lshr_b32 s5, s32, 6
; GFX900-NEXT: v_writelane_b32 v23, s55, 14
+; GFX900-NEXT: v_writelane_b32 v23, s30, 15
+; GFX900-NEXT: v_writelane_b32 v23, s31, 16
+; GFX900-NEXT: s_lshr_b32 s5, s32, 6
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: s_add_i32 s4, s5, 0x4240
; GFX900-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX900-NEXT: v_writelane_b32 v23, s30, 15
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: v_writelane_b32 v22, s4, 0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT: v_writelane_b32 v23, s31, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use alloca0 v0
; GFX900-NEXT: ;;#ASMEND
@@ -1360,14 +1360,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX942-NEXT: v_writelane_b32 v22, s53, 12
; GFX942-NEXT: v_writelane_b32 v22, s54, 13
; GFX942-NEXT: v_writelane_b32 v22, s55, 14
-; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_writelane_b32 v22, s30, 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v22, s31, 16
+; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_writelane_b32 v22, s31, 16
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX942-NEXT: ;;#ASMEND
@@ -1411,15 +1411,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_1-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
; GFX10_1-NEXT: v_writelane_b32 v22, s33, 0
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5
-; GFX10_1-NEXT: s_add_i32 s58, s4, 0x4240
; GFX10_1-NEXT: v_writelane_b32 v22, s34, 1
-; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
-; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use alloca0 v0
-; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_writelane_b32 v22, s35, 2
; GFX10_1-NEXT: v_writelane_b32 v22, s36, 3
; GFX10_1-NEXT: v_writelane_b32 v22, s37, 4
@@ -1435,6 +1427,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_1-NEXT: v_writelane_b32 v22, s55, 14
; GFX10_1-NEXT: v_writelane_b32 v22, s30, 15
; GFX10_1-NEXT: v_writelane_b32 v22, s31, 16
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5
+; GFX10_1-NEXT: s_add_i32 s58, s4, 0x4240
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use alloca0 v0
+; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX10_1-NEXT: ;;#ASMEND
@@ -1476,15 +1476,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_3-NEXT: buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
; GFX10_3-NEXT: v_writelane_b32 v22, s33, 0
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5
-; GFX10_3-NEXT: s_add_i32 s58, s4, 0x4240
; GFX10_3-NEXT: v_writelane_b32 v22, s34, 1
-; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
-; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use alloca0 v0
-; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_writelane_b32 v22, s35, 2
; GFX10_3-NEXT: v_writelane_b32 v22, s36, 3
; GFX10_3-NEXT: v_writelane_b32 v22, s37, 4
@@ -1500,6 +1492,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_3-NEXT: v_writelane_b32 v22, s55, 14
; GFX10_3-NEXT: v_writelane_b32 v22, s30, 15
; GFX10_3-NEXT: v_writelane_b32 v22, s31, 16
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5
+; GFX10_3-NEXT: s_add_i32 s58, s4, 0x4240
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use alloca0 v0
+; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX10_3-NEXT: ;;#ASMEND
@@ -1540,14 +1540,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX11-NEXT: scratch_store_b32 off, v22, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v22, s33, 0
-; GFX11-NEXT: s_add_i32 s0, s32, 64
-; GFX11-NEXT: s_add_i32 s58, s32, 0x4240
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: v_writelane_b32 v22, s34, 1
-; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use alloca0 v0
-; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: v_writelane_b32 v22, s35, 2
; GFX11-NEXT: v_writelane_b32 v22, s36, 3
; GFX11-NEXT: v_writelane_b32 v22, s37, 4
@@ -1563,6 +1556,13 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX11-NEXT: v_writelane_b32 v22, s55, 14
; GFX11-NEXT: v_writelane_b32 v22, s30, 15
; GFX11-NEXT: v_writelane_b32 v22, s31, 16
+; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: s_add_i32 s58, s32, 0x4240
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use alloca0 v0
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX11-NEXT: ;;#ASMEND
@@ -1571,7 +1571,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v22, 15
; GFX11-NEXT: v_readlane_b32 s31, v22, 16
; GFX11-NEXT: v_readlane_b32 s55, v22, 14
@@ -1608,12 +1607,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v22, s33, 0
-; GFX12-NEXT: s_add_co_i32 s58, s32, 0x4200
-; GFX12-NEXT: v_mov_b32_e32 v0, s32
-; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use alloca0 v0
-; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: v_writelane_b32 v22, s34, 1
; GFX12-NEXT: v_writelane_b32 v22, s35, 2
; GFX12-NEXT: v_writelane_b32 v22, s36, 3
@@ -1630,6 +1623,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: v_writelane_b32 v22, s55, 14
; GFX12-NEXT: v_writelane_b32 v22, s30, 15
; GFX12-NEXT: v_writelane_b32 v22, s31, 16
+; GFX12-NEXT: s_add_co_i32 s58, s32, 0x4200
+; GFX12-NEXT: v_mov_b32_e32 v0, s32
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use alloca0 v0
+; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX12-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
index a19853adabb58..9a9e66f10a62d 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
@@ -9,7 +9,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
-define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
+define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) #1 {
; GFX7-LABEL: v_maximumnum_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -228,7 +228,7 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
ret bfloat %result
}
-define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
+define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) #1 {
; GFX7-LABEL: v_maximumnum_bf16_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -377,7 +377,7 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
ret bfloat %result
}
-define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
+define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) #1 {
; GFX7-LABEL: v_maximumnum_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -762,7 +762,7 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
ret <2 x bfloat> %result
}
-define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) {
+define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) #1 {
; GFX7-LABEL: v_maximumnum_v2bf16_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1013,7 +1013,7 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
ret <2 x bfloat> %result
}
-define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
+define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) #1 {
; GFX7-LABEL: v_maximumnum_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1530,7 +1530,7 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
ret <3 x bfloat> %result
}
-define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) {
+define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) #1 {
; GFX7-LABEL: v_maximumnum_v3bf16_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1868,7 +1868,7 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
ret <3 x bfloat> %result
}
-define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
+define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) #1 {
; GFX7-LABEL: v_maximumnum_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2539,7 +2539,7 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
ret <4 x bfloat> %result
}
-define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) {
+define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) #1 {
; GFX7-LABEL: v_maximumnum_v4bf16_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2972,7 +2972,7 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
ret <4 x bfloat> %result
}
-define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
+define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) #1 {
; GFX7-LABEL: v_maximumnum_v6bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3954,7 +3954,7 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
ret <6 x bfloat> %result
}
-define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) #1 {
; GFX7-LABEL: v_maximumnum_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5248,7 +5248,7 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
ret <8 x bfloat> %result
}
-define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) #1 {
; GFX7-LABEL: v_maximumnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7822,7 +7822,7 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
ret <16 x bfloat> %result
}
-define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
+define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) #1 {
; GFX7-LABEL: v_maximumnum_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8025,6 +8025,9 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX8-LABEL: v_maximumnum_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_load_dword v55, off, s[0:3], s32
; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v30
@@ -8076,13 +8079,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX8-NEXT: v_and_b32_e32 v52, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v53, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v54, 0xffff0000, v19
-; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
-; GFX8-NEXT: s_waitcnt vmcnt(3)
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v35, 16, v55
; GFX8-NEXT: v_and_b32_e32 v36, 0xffff0000, v55
; GFX8-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
@@ -8583,6 +8583,9 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX900-LABEL: v_maximumnum_v32bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX900-NEXT: buffer_load_dword v55, off, s[0:3], s32
; GFX900-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
; GFX900-NEXT: v_lshrrev_b32_e32 v32, 16, v30
@@ -8634,13 +8637,10 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX900-NEXT: v_and_b32_e32 v52, 0xffff0000, v21
; GFX900-NEXT: v_and_b32_e32 v53, 0xffff0000, v20
; GFX900-NEXT: v_and_b32_e32 v54, 0xffff0000, v19
-; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX900-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
; GFX900-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
; GFX900-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
-; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_lshrrev_b32_e32 v35, 16, v55
; GFX900-NEXT: v_and_b32_e32 v36, 0xffff0000, v55
; GFX900-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
@@ -9126,6 +9126,9 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX950-LABEL: v_maximumnum_v32bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
; GFX950-NEXT: scratch_load_dword v50, off, s32
; GFX950-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
; GFX950-NEXT: v_lshrrev_b32_e32 v32, 16, v30
@@ -9177,11 +9180,8 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v21
; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v20
; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v19
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
-; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_lshrrev_b32_e32 v35, 16, v50
@@ -14544,6 +14544,8 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
}
attributes #0 = { "amdgpu-ieee"="false" }
+attributes #1 = { nounwind }
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11: {{.*}}
; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index fc3e46b9f5895..42bbd8602b550 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -32,7 +32,7 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-SDAG,GFX12-FAKE16-SDAG %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-GISEL,GFX12-FAKE16-GISEL %s
-define half @v_maximumnum_f16(half %x, half %y) {
+define half @v_maximumnum_f16(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -226,7 +226,7 @@ define half @v_maximumnum_f16(half %x, half %y) {
ret half %result
}
-define half @v_maximumnum_f16_nnan(half %x, half %y) {
+define half @v_maximumnum_f16_nnan(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f16_nnan:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -310,7 +310,7 @@ define half @v_maximumnum_f16_nnan(half %x, half %y) {
ret half %result
}
-define half @v_maximumnum_f16_1.0(half %x) {
+define half @v_maximumnum_f16_1.0(half %x) #1 {
; GFX7-LABEL: v_maximumnum_f16_1.0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -399,7 +399,7 @@ define half @v_maximumnum_f16_1.0(half %x) {
ret half %result
}
-define float @v_maximumnum_f32(float %x, float %y) {
+define float @v_maximumnum_f32(float %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f32:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -523,7 +523,7 @@ define float @v_maximumnum_f32(float %x, float %y) {
ret float %result
}
-define float @v_maximumnum_f32_nnan(float %x, float %y) {
+define float @v_maximumnum_f32_nnan(float %x, float %y) #1 {
; GFX7-LABEL: v_maximumnum_f32_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -573,7 +573,7 @@ define float @v_maximumnum_f32_nnan(float %x, float %y) {
ret float %result
}
-define double @v_maximumnum_f64(double %x, double %y) {
+define double @v_maximumnum_f64(double %x, double %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f64:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -703,7 +703,7 @@ define double @v_maximumnum_f64(double %x, double %y) {
ret double %result
}
-define double @v_maximumnum_f64_nnan(double %x, double %y) {
+define double @v_maximumnum_f64_nnan(double %x, double %y) #1 {
; GFX7-LABEL: v_maximumnum_f64_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -753,7 +753,7 @@ define double @v_maximumnum_f64_nnan(double %x, double %y) {
ret double %result
}
-define float @v_maximumnum_f32_1.0(float %x) {
+define float @v_maximumnum_f32_1.0(float %x) #1 {
; GFX7-LABEL: v_maximumnum_f32_1.0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -813,7 +813,7 @@ define float @v_maximumnum_f32_1.0(float %x) {
ret float %result
}
-define float @v_maximumnum_f32_rhs_not_snan(float %x, float %y) {
+define float @v_maximumnum_f32_rhs_not_snan(float %x, float %y) #1 {
; GFX7-LABEL: v_maximumnum_f32_rhs_not_snan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -878,7 +878,7 @@ define float @v_maximumnum_f32_rhs_not_snan(float %x, float %y) {
ret float %result
}
-define float @v_maximumnum_f32_lhs_not_snan(float %x, float %y) {
+define float @v_maximumnum_f32_lhs_not_snan(float %x, float %y) #1 {
; GFX7-LABEL: v_maximumnum_f32_lhs_not_snan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -943,7 +943,7 @@ define float @v_maximumnum_f32_lhs_not_snan(float %x, float %y) {
ret float %result
}
-define float @v_maximumnum_f32_both_operands_not_snan(float %x, float %y) {
+define float @v_maximumnum_f32_both_operands_not_snan(float %x, float %y) #1 {
; GFX7-LABEL: v_maximumnum_f32_both_operands_not_snan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1009,7 +1009,7 @@ define float @v_maximumnum_f32_both_operands_not_snan(float %x, float %y) {
ret float %result
}
-define double @v_maximumnum_f64_1.0(double %x) {
+define double @v_maximumnum_f64_1.0(double %x) #1 {
; GFX7-LABEL: v_maximumnum_f64_1.0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1069,7 +1069,7 @@ define double @v_maximumnum_f64_1.0(double %x) {
ret double %result
}
-define half @v_maximumnum_f16_s_v(half inreg %x, half %y) {
+define half @v_maximumnum_f16_s_v(half inreg %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f16_s_v:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1279,7 +1279,7 @@ define half @v_maximumnum_f16_s_v(half inreg %x, half %y) {
ret half %result
}
-define half @v_maximumnum_f16_v_s(half %x, half inreg %y) {
+define half @v_maximumnum_f16_v_s(half %x, half inreg %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f16_v_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1489,7 +1489,7 @@ define half @v_maximumnum_f16_v_s(half %x, half inreg %y) {
ret half %result
}
-define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) {
+define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f16_s_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1717,7 +1717,7 @@ define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) {
ret half %result
}
-define float @v_maximumnum_f32_s_v(float inreg %x, float %y) {
+define float @v_maximumnum_f32_s_v(float inreg %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f32_s_v:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1863,7 +1863,7 @@ define float @v_maximumnum_f32_s_v(float inreg %x, float %y) {
ret float %result
}
-define float @v_maximumnum_f32_v_s(float %x, float inreg %y) {
+define float @v_maximumnum_f32_v_s(float %x, float inreg %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f32_v_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2009,7 +2009,7 @@ define float @v_maximumnum_f32_v_s(float %x, float inreg %y) {
ret float %result
}
-define float @v_maximumnum_f32_s_s(float inreg %x, float inreg %y) {
+define float @v_maximumnum_f32_s_s(float inreg %x, float inreg %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f32_s_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2164,7 +2164,7 @@ define float @v_maximumnum_f32_s_s(float inreg %x, float inreg %y) {
ret float %result
}
-define double @v_maximumnum_f64_s_v(double inreg %x, double %y) {
+define double @v_maximumnum_f64_s_v(double inreg %x, double %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f64_s_v:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2271,7 +2271,7 @@ define double @v_maximumnum_f64_s_v(double inreg %x, double %y) {
ret double %result
}
-define double @v_maximumnum_f64_v_s(double %x, double inreg %y) {
+define double @v_maximumnum_f64_v_s(double %x, double inreg %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f64_v_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2378,7 +2378,7 @@ define double @v_maximumnum_f64_v_s(double %x, double inreg %y) {
ret double %result
}
-define double @v_maximumnum_f64_s_s(double inreg %x, double inreg %y) {
+define double @v_maximumnum_f64_s_s(double inreg %x, double inreg %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f64_s_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2524,7 +2524,7 @@ define double @v_maximumnum_f64_s_s(double inreg %x, double inreg %y) {
ret double %result
}
-define float @v_maximumnum_f32_fabs_rhs(float %x, float %y) {
+define float @v_maximumnum_f32_fabs_rhs(float %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f32_fabs_rhs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2655,7 +2655,7 @@ define float @v_maximumnum_f32_fabs_rhs(float %x, float %y) {
ret float %result
}
-define float @v_maximumnum_f32_fneg_fabs_rhs(float %x, float %y) {
+define float @v_maximumnum_f32_fneg_fabs_rhs(float %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2787,7 +2787,7 @@ define float @v_maximumnum_f32_fneg_fabs_rhs(float %x, float %y) {
ret float %result
}
-define float @v_maximumnum_f32_fabs(float %x, float %y) {
+define float @v_maximumnum_f32_fabs(float %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f32_fabs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2919,7 +2919,7 @@ define float @v_maximumnum_f32_fabs(float %x, float %y) {
ret float %result
}
-define float @v_maximumnum_f32_fneg(float %x, float %y) {
+define float @v_maximumnum_f32_fneg(float %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f32_fneg:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3051,7 +3051,7 @@ define float @v_maximumnum_f32_fneg(float %x, float %y) {
ret float %result
}
-define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) {
+define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f16_fabs_rhs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3246,7 +3246,7 @@ define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) {
ret half %result
}
-define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) {
+define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3442,7 +3442,7 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) {
ret half %result
}
-define half @v_maximumnum_f16_fabs(half %x, half %y) {
+define half @v_maximumnum_f16_fabs(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f16_fabs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3638,7 +3638,7 @@ define half @v_maximumnum_f16_fabs(half %x, half %y) {
ret half %result
}
-define half @v_maximumnum_f16_fneg(half %x, half %y) {
+define half @v_maximumnum_f16_fneg(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f16_fneg:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3834,7 +3834,7 @@ define half @v_maximumnum_f16_fneg(half %x, half %y) {
ret half %result
}
-define double @v_maximumnum_f64_fneg(double %x, double %y) {
+define double @v_maximumnum_f64_fneg(double %x, double %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_f64_fneg:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3966,7 +3966,7 @@ define double @v_maximumnum_f64_fneg(double %x, double %y) {
ret double %result
}
-define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v2f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4140,7 +4140,7 @@ define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) {
ret <2 x half> %result
}
-define <2 x half> @v_maximumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_maximumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v2f16_nnan:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4228,7 +4228,7 @@ define <2 x half> @v_maximumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) {
ret <2 x half> %result
}
-define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
+define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v3f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4440,7 +4440,7 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
ret <3 x half> %result
}
-define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
+define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v3f16_nnan:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4550,7 +4550,7 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
ret <3 x half> %result
}
-define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
+define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v4f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4798,7 +4798,7 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
ret <4 x half> %result
}
-define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
+define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v4f16_nnan:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4921,7 +4921,7 @@ define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
ret <4 x half> %result
}
-define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
+define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v6f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5243,7 +5243,7 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
ret <6 x half> %result
}
-define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
+define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v8f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5639,7 +5639,7 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
ret <8 x half> %result
}
-define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
+define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v16f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6325,10 +6325,15 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
ret <16 x half> %result
}
-define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
+define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v32f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX7-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX7-SDAG-NEXT: buffer_load_dword v49, off, s[0:3], s32
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v30
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v14
@@ -6391,13 +6396,8 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX7-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX7-SDAG-NEXT: v_max_f32_e32 v53, v54, v53
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v22
+; GFX7-SDAG-NEXT: v_max_f32_e32 v53, v54, v53
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v6
; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v17
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v53
@@ -6413,9 +6413,9 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v44, v44
; GFX7-SDAG-NEXT: v_max_f32_e32 v48, v41, v40
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5)
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v49
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v16
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v49
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v0
; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v19
; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v17
@@ -7676,7 +7676,7 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
ret <32 x half> %result
}
-define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
+define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v2f32:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7830,7 +7830,7 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
ret <2 x float> %result
}
-define <2 x float> @v_maximumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) {
+define <2 x float> @v_maximumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) #1 {
; GFX7-LABEL: v_maximumnum_v2f32_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7884,7 +7884,7 @@ define <2 x float> @v_maximumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) {
ret <2 x float> %result
}
-define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
+define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v3f32:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8074,7 +8074,7 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
ret <3 x float> %result
}
-define <3 x float> @v_maximumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) {
+define <3 x float> @v_maximumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) #1 {
; GFX7-LABEL: v_maximumnum_v3f32_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8135,7 +8135,7 @@ define <3 x float> @v_maximumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) {
ret <3 x float> %result
}
-define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
+define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v4f32:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8355,7 +8355,7 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
ret <4 x float> %result
}
-define <4 x float> @v_maximumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) {
+define <4 x float> @v_maximumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) #1 {
; GFX7-LABEL: v_maximumnum_v4f32_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8420,7 +8420,7 @@ define <4 x float> @v_maximumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) {
ret <4 x float> %result
}
-define <2 x double> @v_maximumnum_v2f64(<2 x double> %x, <2 x double> %y) {
+define <2 x double> @v_maximumnum_v2f64(<2 x double> %x, <2 x double> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v2f64:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8614,7 +8614,7 @@ define <2 x double> @v_maximumnum_v2f64(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %result
}
-define <2 x double> @v_maximumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) {
+define <2 x double> @v_maximumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) #1 {
; GFX7-LABEL: v_maximumnum_v2f64_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8671,7 +8671,7 @@ define <2 x double> @v_maximumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %result
}
-define <3 x double> @v_maximumnum_v3f64(<3 x double> %x, <3 x double> %y) {
+define <3 x double> @v_maximumnum_v3f64(<3 x double> %x, <3 x double> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v3f64:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8913,7 +8913,7 @@ define <3 x double> @v_maximumnum_v3f64(<3 x double> %x, <3 x double> %y) {
ret <3 x double> %result
}
-define <3 x double> @v_maximumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) {
+define <3 x double> @v_maximumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) #1 {
; GFX7-LABEL: v_maximumnum_v3f64_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8977,7 +8977,7 @@ define <3 x double> @v_maximumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) {
ret <3 x double> %result
}
-define <4 x double> @v_maximumnum_v4f64(<4 x double> %x, <4 x double> %y) {
+define <4 x double> @v_maximumnum_v4f64(<4 x double> %x, <4 x double> %y) #1 {
; GFX7-SDAG-LABEL: v_maximumnum_v4f64:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9267,7 +9267,7 @@ define <4 x double> @v_maximumnum_v4f64(<4 x double> %x, <4 x double> %y) {
ret <4 x double> %result
}
-define <4 x double> @v_maximumnum_v4f64_nnan(<4 x double> %x, <4 x double> %y) {
+define <4 x double> @v_maximumnum_v4f64_nnan(<4 x double> %x, <4 x double> %y) #1 {
; GFX7-LABEL: v_maximumnum_v4f64_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10251,7 +10251,9 @@ define <4 x half> @v_maximumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y)
ret <4 x half> %result
}
-attributes #0 = { "amdgpu-ieee"="false" }
+attributes #0 = { "amdgpu-ieee"="false" nounwind }
+attributes #1 = { nounwind }
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX900: {{.*}}
; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index cc0fce9a5261b..6cb28a360d2a3 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -9,7 +9,7 @@
; memcpy for address spaces 0, 1, 4, 5
-define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) #1 {
; CHECK-LABEL: memcpy_p0_p0_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -85,7 +85,6 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; ALIGNED-LABEL: memcpy_p0_p0_sz2048:
; ALIGNED: ; %bb.0: ; %entry
; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
@@ -103,6 +102,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
; ALIGNED-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4
@@ -794,7 +794,7 @@ entry:
ret void
}
-define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) #1 {
; CHECK-LABEL: memcpy_p1_p1_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -865,7 +865,6 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
; ALIGNED-LABEL: memcpy_p1_p1_sz2048:
; ALIGNED: ; %bb.0: ; %entry
; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -874,6 +873,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
; ALIGNED-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
@@ -1547,7 +1547,7 @@ entry:
ret void
}
-define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) #1 {
; CHECK-LABEL: memcpy_p0_p4_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2161,7 +2161,7 @@ entry:
ret void
}
-define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) #1 {
; CHECK-LABEL: memcpy_p5_p5_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2373,7 +2373,6 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5)
; ALIGNED-LABEL: memcpy_p5_p5_sz2048:
; ALIGNED: ; %bb.0: ; %entry
; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
@@ -2422,6 +2421,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
; ALIGNED-NEXT: .LBB3_1: ; %static-memcpy-expansion-main-body
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: s_clause 0x34
@@ -3608,7 +3608,7 @@ entry:
ret void
}
-define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) #1 {
; CHECK-LABEL: memcpy_p0_p5_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5371,7 +5371,7 @@ entry:
; memmove for address spaces 0, 1, 4, 5
-define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) #1 {
; CHECK-LABEL: memmove_p0_p0_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6916,7 +6916,7 @@ entry:
ret void
}
-define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) #1 {
; CHECK-LABEL: memmove_p1_p1_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8431,7 +8431,7 @@ entry:
ret void
}
-define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) #1 {
; CHECK-LABEL: memmove_p0_p4_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9674,7 +9674,7 @@ entry:
ret void
}
-define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) #1 {
; CHECK-LABEL: memmove_p5_p5_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12475,7 +12475,7 @@ entry:
ret void
}
-define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) #1 {
; CHECK-LABEL: memmove_p0_p5_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12705,8 +12705,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-LABEL: memmove_p0_p5_sz2048:
; ALIGNED: ; %bb.0: ; %entry
; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ALIGNED-NEXT: v_mov_b32_e32 v69, v1
-; ALIGNED-NEXT: v_mov_b32_e32 v68, v0
; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
@@ -12755,6 +12753,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_mov_b32_e32 v69, v1
+; ALIGNED-NEXT: v_mov_b32_e32 v68, v0
; ALIGNED-NEXT: s_mov_b32 s4, exec_lo
; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[68:69]
; ALIGNED-NEXT: v_cndmask_b32_e32 v3, -1, v68, vcc_lo
@@ -15939,7 +15939,7 @@ entry:
ret void
}
-define void @memset_p0_sz2048(ptr addrspace(0) %dst) {
+define void @memset_p0_sz2048(ptr addrspace(0) %dst) #1 {
; CHECK-LABEL: memset_p0_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16370,7 +16370,7 @@ entry:
ret void
}
-define void @memset_p1_sz2048(ptr addrspace(1) %dst) {
+define void @memset_p1_sz2048(ptr addrspace(1) %dst) #1 {
; CHECK-LABEL: memset_p1_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16794,7 +16794,7 @@ entry:
ret void
}
-define void @memset_p3_sz2048(ptr addrspace(3) %dst) {
+define void @memset_p3_sz2048(ptr addrspace(3) %dst) #1 {
; CHECK-LABEL: memset_p3_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17153,7 +17153,7 @@ entry:
ret void
}
-define void @memset_p5_sz2048(ptr addrspace(5) %dst) {
+define void @memset_p5_sz2048(ptr addrspace(5) %dst) #1 {
; CHECK-LABEL: memset_p5_sz2048:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17569,5 +17569,6 @@ declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture writeonly, i8, i64,
declare void @llvm.memset.p3.i64(ptr addrspace(3) nocapture writeonly, i8, i64, i1 immarg) #3
declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) #3
+attributes #1 = { nounwind }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll
index c1877ac246521..5752bf90cb20f 100644
--- a/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll
@@ -3,7 +3,7 @@
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 %s -o - | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
-define void @memset_p0_varsize_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval, i64 %size) {
+define void @memset_p0_varsize_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval, i64 %size) #0 {
; GFX942-SDAG-LABEL: memset_p0_varsize_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -128,7 +128,7 @@ entry:
ret void
}
-define void @memset_p1_varsize_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval, i64 %size) {
+define void @memset_p1_varsize_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval, i64 %size) #0 {
; GFX942-SDAG-LABEL: memset_p1_varsize_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -253,7 +253,7 @@ entry:
ret void
}
-define void @memset_p3_varsize_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval, i64 %size) {
+define void @memset_p3_varsize_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval, i64 %size) #0 {
; GFX942-SDAG-LABEL: memset_p3_varsize_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -372,7 +372,7 @@ entry:
ret void
}
-define void @memset_p5_varsize_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval, i64 %size) {
+define void @memset_p5_varsize_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval, i64 %size) #0 {
; GFX942-SDAG-LABEL: memset_p5_varsize_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -490,12 +490,10 @@ entry:
ret void
}
-define void @memset_p0_sz1055_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval) {
+define void @memset_p0_sz1055_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval) #0 {
; GFX942-SDAG-LABEL: memset_p0_sz1055_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
-; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s0
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
@@ -510,6 +508,8 @@ define void @memset_p0_sz1055_align_4_varsetval(ptr addrspace(0) align 4 %dst, i
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4
; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4
; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4
@@ -683,12 +683,10 @@ entry:
ret void
}
-define void @memset_p0_sz2048_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval) {
+define void @memset_p0_sz2048_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval) #0 {
; GFX942-SDAG-LABEL: memset_p0_sz2048_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
-; GFX942-SDAG-NEXT: v_perm_b32 v2, v2, v2, s0
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
@@ -701,6 +699,8 @@ define void @memset_p0_sz2048_align_4_varsetval(ptr addrspace(0) align 4 %dst, i
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v2, v2, v2, s0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2
; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2
; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2
@@ -843,7 +843,7 @@ entry:
ret void
}
-define void @memset_p1_sz1055_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval) {
+define void @memset_p1_sz1055_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval) #0 {
; GFX942-SDAG-LABEL: memset_p1_sz1055_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -987,7 +987,7 @@ entry:
ret void
}
-define void @memset_p1_sz2048_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval) {
+define void @memset_p1_sz2048_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval) #0 {
; GFX942-SDAG-LABEL: memset_p1_sz2048_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1102,7 +1102,7 @@ entry:
ret void
}
-define void @memset_p3_sz1055_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval) {
+define void @memset_p3_sz1055_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval) #0 {
; GFX942-SDAG-LABEL: memset_p3_sz1055_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1263,7 +1263,7 @@ entry:
ret void
}
-define void @memset_p3_sz2048_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval) {
+define void @memset_p3_sz2048_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval) #0 {
; GFX942-SDAG-LABEL: memset_p3_sz2048_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1390,7 +1390,7 @@ entry:
ret void
}
-define void @memset_p5_sz1055_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval) {
+define void @memset_p5_sz1055_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval) #0 {
; GFX942-SDAG-LABEL: memset_p5_sz1055_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1535,7 +1535,7 @@ entry:
ret void
}
-define void @memset_p5_sz2048_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval) {
+define void @memset_p5_sz2048_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval) #0 {
; GFX942-SDAG-LABEL: memset_p5_sz2048_align_4_varsetval:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1646,7 +1646,7 @@ entry:
ret void
}
-define void @memset_p1_varsz_align_4_set40(ptr addrspace(1) align 4 %dst, i64 %size) {
+define void @memset_p1_varsz_align_4_set40(ptr addrspace(1) align 4 %dst, i64 %size) #0 {
; GFX942-SDAG-LABEL: memset_p1_varsz_align_4_set40:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1766,7 +1766,7 @@ entry:
ret void
}
-define void @memset_p1_varsz_align_4_set0(ptr addrspace(1) align 4 %dst, i64 %size) {
+define void @memset_p1_varsz_align_4_set0(ptr addrspace(1) align 4 %dst, i64 %size) #0 {
; GFX942-SDAG-LABEL: memset_p1_varsz_align_4_set0:
; GFX942-SDAG: ; %bb.0: ; %entry
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1891,6 +1891,7 @@ declare void @llvm.memset.p1.i64(ptr addrspace(1) noalias nocapture writeonly, i
declare void @llvm.memset.p3.i64(ptr addrspace(3) noalias nocapture writeonly, i8, i64, i1 immarg)
declare void @llvm.memset.p5.i64(ptr addrspace(5) noalias nocapture writeonly, i8, i64, i1 immarg)
+attributes #0 = { nounwind }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX942: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
index ba0617dac5d0d..8fe4249ce1d64 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
@@ -9,7 +9,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
-define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
+define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) #1 {
; GFX7-LABEL: v_minimumnum_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -230,7 +230,7 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
ret bfloat %result
}
-define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
+define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) #1 {
; GFX7-LABEL: v_minimumnum_bf16_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -383,7 +383,7 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
}
-define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
+define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) #1 {
; GFX7-LABEL: v_minimumnum_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -771,7 +771,7 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
ret <2 x bfloat> %result
}
-define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) {
+define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) #1 {
; GFX7-LABEL: v_minimumnum_v2bf16_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1025,7 +1025,7 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
ret <2 x bfloat> %result
}
-define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
+define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) #1 {
; GFX7-LABEL: v_minimumnum_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1545,7 +1545,7 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
ret <3 x bfloat> %result
}
-define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) {
+define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) #1 {
; GFX7-LABEL: v_minimumnum_v3bf16_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1885,7 +1885,7 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
ret <3 x bfloat> %result
}
-define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
+define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) #1 {
; GFX7-LABEL: v_minimumnum_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2559,7 +2559,7 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
ret <4 x bfloat> %result
}
-define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) {
+define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) #1 {
; GFX7-LABEL: v_minimumnum_v4bf16_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2995,7 +2995,7 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
ret <4 x bfloat> %result
}
-define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
+define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) #1 {
; GFX7-LABEL: v_minimumnum_v6bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3980,7 +3980,7 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
ret <6 x bfloat> %result
}
-define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) #1 {
; GFX7-LABEL: v_minimumnum_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5276,7 +5276,7 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
ret <8 x bfloat> %result
}
-define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) #1 {
; GFX7-LABEL: v_minimumnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7852,7 +7852,7 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
ret <16 x bfloat> %result
}
-define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
+define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) #1 {
; GFX7-LABEL: v_minimumnum_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8055,6 +8055,9 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX8-LABEL: v_minimumnum_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_load_dword v55, off, s[0:3], s32
; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v30
@@ -8103,13 +8106,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX8-NEXT: v_and_b32_e32 v52, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v53, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v54, 0xffff0000, v19
-; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
-; GFX8-NEXT: s_waitcnt vmcnt(3)
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v35, 16, v55
; GFX8-NEXT: v_and_b32_e32 v36, 0xffff0000, v55
; GFX8-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
@@ -8614,6 +8614,9 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX900-LABEL: v_minimumnum_v32bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX900-NEXT: buffer_load_dword v55, off, s[0:3], s32
; GFX900-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
; GFX900-NEXT: v_lshrrev_b32_e32 v32, 16, v30
@@ -8662,13 +8665,10 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX900-NEXT: v_and_b32_e32 v52, 0xffff0000, v21
; GFX900-NEXT: v_and_b32_e32 v53, 0xffff0000, v20
; GFX900-NEXT: v_and_b32_e32 v54, 0xffff0000, v19
-; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX900-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
; GFX900-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
; GFX900-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
-; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_lshrrev_b32_e32 v35, 16, v55
; GFX900-NEXT: v_and_b32_e32 v36, 0xffff0000, v55
; GFX900-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
@@ -9158,6 +9158,9 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX950-LABEL: v_minimumnum_v32bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
; GFX950-NEXT: scratch_load_dword v50, off, s32
; GFX950-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
; GFX950-NEXT: v_lshrrev_b32_e32 v32, 16, v30
@@ -9206,11 +9209,8 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v21
; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v20
; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v19
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
-; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
; GFX950-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_lshrrev_b32_e32 v35, 16, v50
@@ -14587,7 +14587,9 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
ret <4 x bfloat> %result
}
-attributes #0 = { "amdgpu-ieee"="false" }
+attributes #0 = { "amdgpu-ieee"="false" nounwind }
+attributes #1 = { nounwind }
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11: {{.*}}
; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 21cbff0b17ec6..35cd1dcaf380f 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -32,7 +32,7 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-SDAG,GFX12-FAKE16-SDAG %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-GISEL,GFX12-FAKE16-GISEL %s
-define half @v_minimumnum_f16(half %x, half %y) {
+define half @v_minimumnum_f16(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -226,7 +226,7 @@ define half @v_minimumnum_f16(half %x, half %y) {
ret half %result
}
-define half @v_minimumnum_f16_nnan(half %x, half %y) {
+define half @v_minimumnum_f16_nnan(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f16_nnan:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -310,7 +310,7 @@ define half @v_minimumnum_f16_nnan(half %x, half %y) {
ret half %result
}
-define half @v_minimumnum_f16_1.0(half %x) {
+define half @v_minimumnum_f16_1.0(half %x) #1 {
; GFX7-LABEL: v_minimumnum_f16_1.0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -399,7 +399,7 @@ define half @v_minimumnum_f16_1.0(half %x) {
ret half %result
}
-define float @v_minimumnum_f32(float %x, float %y) {
+define float @v_minimumnum_f32(float %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f32:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -523,7 +523,7 @@ define float @v_minimumnum_f32(float %x, float %y) {
ret float %result
}
-define float @v_minimumnum_f32_nnan(float %x, float %y) {
+define float @v_minimumnum_f32_nnan(float %x, float %y) #1 {
; GFX7-LABEL: v_minimumnum_f32_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -573,7 +573,7 @@ define float @v_minimumnum_f32_nnan(float %x, float %y) {
ret float %result
}
-define double @v_minimumnum_f64(double %x, double %y) {
+define double @v_minimumnum_f64(double %x, double %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f64:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -703,7 +703,7 @@ define double @v_minimumnum_f64(double %x, double %y) {
ret double %result
}
-define double @v_minimumnum_f64_nnan(double %x, double %y) {
+define double @v_minimumnum_f64_nnan(double %x, double %y) #1 {
; GFX7-LABEL: v_minimumnum_f64_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -753,7 +753,7 @@ define double @v_minimumnum_f64_nnan(double %x, double %y) {
ret double %result
}
-define float @v_minimumnum_f32_1.0(float %x) {
+define float @v_minimumnum_f32_1.0(float %x) #1 {
; GFX7-LABEL: v_minimumnum_f32_1.0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -813,7 +813,7 @@ define float @v_minimumnum_f32_1.0(float %x) {
ret float %result
}
-define float @v_minimumnum_f32_rhs_not_snan(float %x, float %y) {
+define float @v_minimumnum_f32_rhs_not_snan(float %x, float %y) #1 {
; GFX7-LABEL: v_minimumnum_f32_rhs_not_snan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -878,7 +878,7 @@ define float @v_minimumnum_f32_rhs_not_snan(float %x, float %y) {
ret float %result
}
-define float @v_minimumnum_f32_lhs_not_snan(float %x, float %y) {
+define float @v_minimumnum_f32_lhs_not_snan(float %x, float %y) #1 {
; GFX7-LABEL: v_minimumnum_f32_lhs_not_snan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -943,7 +943,7 @@ define float @v_minimumnum_f32_lhs_not_snan(float %x, float %y) {
ret float %result
}
-define float @v_minimumnum_f32_both_operands_not_snan(float %x, float %y) {
+define float @v_minimumnum_f32_both_operands_not_snan(float %x, float %y) #1 {
; GFX7-LABEL: v_minimumnum_f32_both_operands_not_snan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1009,7 +1009,7 @@ define float @v_minimumnum_f32_both_operands_not_snan(float %x, float %y) {
ret float %result
}
-define double @v_minimumnum_f64_1.0(double %x) {
+define double @v_minimumnum_f64_1.0(double %x) #1 {
; GFX7-LABEL: v_minimumnum_f64_1.0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1069,7 +1069,7 @@ define double @v_minimumnum_f64_1.0(double %x) {
ret double %result
}
-define half @v_minimumnum_f16_v_s(half %x, half inreg %y) {
+define half @v_minimumnum_f16_v_s(half %x, half inreg %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f16_v_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1279,7 +1279,7 @@ define half @v_minimumnum_f16_v_s(half %x, half inreg %y) {
ret half %result
}
-define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) {
+define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f16_s_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1507,7 +1507,7 @@ define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) {
ret half %result
}
-define float @v_minimumnum_f32_s_v(float inreg %x, float %y) {
+define float @v_minimumnum_f32_s_v(float inreg %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f32_s_v:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1653,7 +1653,7 @@ define float @v_minimumnum_f32_s_v(float inreg %x, float %y) {
ret float %result
}
-define float @v_minimumnum_f32_v_s(float %x, float inreg %y) {
+define float @v_minimumnum_f32_v_s(float %x, float inreg %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f32_v_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1799,7 +1799,7 @@ define float @v_minimumnum_f32_v_s(float %x, float inreg %y) {
ret float %result
}
-define float @v_minimumnum_f32_s_s(float inreg %x, float inreg %y) {
+define float @v_minimumnum_f32_s_s(float inreg %x, float inreg %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f32_s_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1954,7 +1954,7 @@ define float @v_minimumnum_f32_s_s(float inreg %x, float inreg %y) {
ret float %result
}
-define double @v_minimumnum_f64_s_v(double inreg %x, double %y) {
+define double @v_minimumnum_f64_s_v(double inreg %x, double %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f64_s_v:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2061,7 +2061,7 @@ define double @v_minimumnum_f64_s_v(double inreg %x, double %y) {
ret double %result
}
-define double @v_minimumnum_f64_v_s(double %x, double inreg %y) {
+define double @v_minimumnum_f64_v_s(double %x, double inreg %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f64_v_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2168,7 +2168,7 @@ define double @v_minimumnum_f64_v_s(double %x, double inreg %y) {
ret double %result
}
-define double @v_minimumnum_f64_s_s(double inreg %x, double inreg %y) {
+define double @v_minimumnum_f64_s_s(double inreg %x, double inreg %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f64_s_s:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2314,7 +2314,7 @@ define double @v_minimumnum_f64_s_s(double inreg %x, double inreg %y) {
ret double %result
}
-define float @v_minimumnum_f32_fabs_rhs(float %x, float %y) {
+define float @v_minimumnum_f32_fabs_rhs(float %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f32_fabs_rhs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2445,7 +2445,7 @@ define float @v_minimumnum_f32_fabs_rhs(float %x, float %y) {
ret float %result
}
-define float @v_minimumnum_f32_fneg_fabs_rhs(float %x, float %y) {
+define float @v_minimumnum_f32_fneg_fabs_rhs(float %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2577,7 +2577,7 @@ define float @v_minimumnum_f32_fneg_fabs_rhs(float %x, float %y) {
ret float %result
}
-define float @v_minimumnum_f32_fabs(float %x, float %y) {
+define float @v_minimumnum_f32_fabs(float %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f32_fabs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2709,7 +2709,7 @@ define float @v_minimumnum_f32_fabs(float %x, float %y) {
ret float %result
}
-define float @v_minimumnum_f32_fneg(float %x, float %y) {
+define float @v_minimumnum_f32_fneg(float %x, float %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f32_fneg:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2841,7 +2841,7 @@ define float @v_minimumnum_f32_fneg(float %x, float %y) {
ret float %result
}
-define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) {
+define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f16_fabs_rhs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3036,7 +3036,7 @@ define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) {
ret half %result
}
-define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) {
+define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3232,7 +3232,7 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) {
ret half %result
}
-define half @v_minimumnum_f16_fabs(half %x, half %y) {
+define half @v_minimumnum_f16_fabs(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f16_fabs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3428,7 +3428,7 @@ define half @v_minimumnum_f16_fabs(half %x, half %y) {
ret half %result
}
-define half @v_minimumnum_f16_fneg(half %x, half %y) {
+define half @v_minimumnum_f16_fneg(half %x, half %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f16_fneg:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3624,7 +3624,7 @@ define half @v_minimumnum_f16_fneg(half %x, half %y) {
ret half %result
}
-define double @v_minimumnum_f64_fneg(double %x, double %y) {
+define double @v_minimumnum_f64_fneg(double %x, double %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_f64_fneg:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3756,7 +3756,7 @@ define double @v_minimumnum_f64_fneg(double %x, double %y) {
ret double %result
}
-define <2 x half> @v_minimumnum_v2f16(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_minimumnum_v2f16(<2 x half> %x, <2 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v2f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3930,7 +3930,7 @@ define <2 x half> @v_minimumnum_v2f16(<2 x half> %x, <2 x half> %y) {
ret <2 x half> %result
}
-define <2 x half> @v_minimumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_minimumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v2f16_nnan:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4018,7 +4018,7 @@ define <2 x half> @v_minimumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) {
ret <2 x half> %result
}
-define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
+define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v3f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4230,7 +4230,7 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
ret <3 x half> %result
}
-define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
+define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v3f16_nnan:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4340,7 +4340,7 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
ret <3 x half> %result
}
-define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
+define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v4f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4588,7 +4588,7 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
ret <4 x half> %result
}
-define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
+define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v4f16_nnan:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4711,7 +4711,7 @@ define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
ret <4 x half> %result
}
-define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
+define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v6f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5033,7 +5033,7 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
ret <6 x half> %result
}
-define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
+define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v8f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5429,7 +5429,7 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
ret <8 x half> %result
}
-define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
+define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v16f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6115,10 +6115,15 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
ret <16 x half> %result
}
-define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
+define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v32f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX7-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX7-SDAG-NEXT: buffer_load_dword v49, off, s[0:3], s32
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v30
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v14
@@ -6181,13 +6186,8 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX7-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX7-SDAG-NEXT: v_min_f32_e32 v53, v54, v53
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v22
+; GFX7-SDAG-NEXT: v_min_f32_e32 v53, v54, v53
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v6
; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v17
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v53
@@ -6203,9 +6203,9 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v44, v44
; GFX7-SDAG-NEXT: v_min_f32_e32 v48, v41, v40
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5)
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v49
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v16
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v49
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v0
; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v19
; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v17
@@ -7466,7 +7466,7 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
ret <32 x half> %result
}
-define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
+define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v2f32:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7620,7 +7620,7 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
ret <2 x float> %result
}
-define <2 x float> @v_minimumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) {
+define <2 x float> @v_minimumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) #1 {
; GFX7-LABEL: v_minimumnum_v2f32_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7674,7 +7674,7 @@ define <2 x float> @v_minimumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) {
ret <2 x float> %result
}
-define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
+define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v3f32:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7864,7 +7864,7 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
ret <3 x float> %result
}
-define <3 x float> @v_minimumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) {
+define <3 x float> @v_minimumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) #1 {
; GFX7-LABEL: v_minimumnum_v3f32_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7925,7 +7925,7 @@ define <3 x float> @v_minimumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) {
ret <3 x float> %result
}
-define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
+define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v4f32:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8145,7 +8145,7 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
ret <4 x float> %result
}
-define <4 x float> @v_minimumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) {
+define <4 x float> @v_minimumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) #1 {
; GFX7-LABEL: v_minimumnum_v4f32_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8210,7 +8210,7 @@ define <4 x float> @v_minimumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) {
ret <4 x float> %result
}
-define <2 x double> @v_minimumnum_v2f64(<2 x double> %x, <2 x double> %y) {
+define <2 x double> @v_minimumnum_v2f64(<2 x double> %x, <2 x double> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v2f64:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8404,7 +8404,7 @@ define <2 x double> @v_minimumnum_v2f64(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %result
}
-define <2 x double> @v_minimumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) {
+define <2 x double> @v_minimumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) #1 {
; GFX7-LABEL: v_minimumnum_v2f64_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8461,7 +8461,7 @@ define <2 x double> @v_minimumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %result
}
-define <3 x double> @v_minimumnum_v3f64(<3 x double> %x, <3 x double> %y) {
+define <3 x double> @v_minimumnum_v3f64(<3 x double> %x, <3 x double> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v3f64:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8703,7 +8703,7 @@ define <3 x double> @v_minimumnum_v3f64(<3 x double> %x, <3 x double> %y) {
ret <3 x double> %result
}
-define <3 x double> @v_minimumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) {
+define <3 x double> @v_minimumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) #1 {
; GFX7-LABEL: v_minimumnum_v3f64_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8767,7 +8767,7 @@ define <3 x double> @v_minimumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) {
ret <3 x double> %result
}
-define <4 x double> @v_minimumnum_v4f64(<4 x double> %x, <4 x double> %y) {
+define <4 x double> @v_minimumnum_v4f64(<4 x double> %x, <4 x double> %y) #1 {
; GFX7-SDAG-LABEL: v_minimumnum_v4f64:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9057,7 +9057,7 @@ define <4 x double> @v_minimumnum_v4f64(<4 x double> %x, <4 x double> %y) {
ret <4 x double> %result
}
-define <4 x double> @v_minimumnum_v4f64_nnan(<4 x double> %x, <4 x double> %y) {
+define <4 x double> @v_minimumnum_v4f64_nnan(<4 x double> %x, <4 x double> %y) #1 {
; GFX7-LABEL: v_minimumnum_v4f64_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10041,7 +10041,9 @@ define <4 x half> @v_minimumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y)
ret <4 x half> %result
}
-attributes #0 = { "amdgpu-ieee"="false" }
+attributes #0 = { "amdgpu-ieee"="false" nounwind }
+attributes #1 = { nounwind }
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX900: {{.*}}
; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 87a5012dda7e2..57981d1763959 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -195,21 +195,21 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: v_writelane_b32 v43, s4, 5
; GFX9-NEXT: s_addk_i32 s32, 0x800
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v43, s34, 0
; GFX9-NEXT: v_writelane_b32 v43, s36, 1
+; GFX9-NEXT: v_writelane_b32 v43, s37, 2
+; GFX9-NEXT: v_writelane_b32 v43, s30, 3
+; GFX9-NEXT: v_writelane_b32 v43, s31, 4
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v43, s37, 2
; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v40, v1
; GFX9-NEXT: v_mov_b32_e32 v41, v0
-; GFX9-NEXT: v_writelane_b32 v43, s30, 3
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40
-; GFX9-NEXT: v_writelane_b32 v43, s31, 4
; GFX9-NEXT: s_mov_b32 s34, s15
; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index 8394b325bee6d..da9463b1329c7 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -18,15 +18,15 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s16, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, external_void_func_i32@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, external_void_func_i32@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -52,20 +52,20 @@ define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
-; GCN-NEXT: s_addk_i32 s32, 0x1400
; GCN-NEXT: v_writelane_b32 v40, s16, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x1400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, external_void_func_i32@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, external_void_func_i32@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index 0da206e2485c4..2a73ed58a6207 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -233,6 +233,7 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
; CHECK-NEXT: s_add_i32 s32, s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: .cfi_llvm_vector_registers 16, 2623, 0, 32, 2623, 1, 32
; CHECK-NEXT: .Ltmp0:
; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3
; CHECK-NEXT: s_getpc_b64 s[16:17]
diff --git a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
index cf9cfc47f10f8..418aa28f46ae1 100644
--- a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
@@ -24,12 +24,12 @@ define float @call_nofpclass_funcs_f32(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
-; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v4, s30, 0
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v4, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, func_f32@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, func_f32@rel32@hi+12
-; CHECK-NEXT: v_writelane_b32 v4, s31, 1
; CHECK-NEXT: v_mov_b32_e32 v2, v0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v3, v0
@@ -71,12 +71,12 @@ define <2 x float> @call_nofpclass_funcs_v2f32(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
-; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v6, s30, 0
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v6, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
-; CHECK-NEXT: v_writelane_b32 v6, s31, 1
; CHECK-NEXT: v_mov_b32_e32 v2, v1
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -122,12 +122,12 @@ define double @call_nofpclass_funcs_f64(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
-; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v6, s30, 0
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v6, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, func_f64@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, func_f64@rel32@hi+12
-; CHECK-NEXT: v_writelane_b32 v6, s31, 1
; CHECK-NEXT: v_mov_b32_e32 v4, v1
; CHECK-NEXT: v_mov_b32_e32 v5, v0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -199,14 +199,14 @@ define nofpclass(nan inf) { double, double } @aggregate() #0 {
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
-; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s16, 2
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, aggregate@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, aggregate@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; CHECK-NEXT: v_writelane_b32 v40, s30, 0
-; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
@@ -237,12 +237,12 @@ define { float, float } @aggregate_use(float %z) #0 {
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v41, s16, 2
; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v41, s30, 0
+; CHECK-NEXT: v_writelane_b32 v41, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, aggregate_f32@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, aggregate_f32@rel32@hi+12
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v41, s31, 1
; CHECK-NEXT: v_mov_b32_e32 v40, v0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_max_f32_e32 v2, v40, v40
@@ -295,12 +295,12 @@ define <5 x double> @call_nofpclass_funcs_v5f64_non_mvt_vector(ptr addrspace(1)
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v24, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
-; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v24, s30, 0
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v24, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, func_v5f64@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, func_v5f64@rel32@hi+12
-; CHECK-NEXT: v_writelane_b32 v24, s31, 1
; CHECK-NEXT: v_mov_b32_e32 v22, v1
; CHECK-NEXT: v_mov_b32_e32 v23, v0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir
index cd866ee59a956..5f46e4de24461 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir
@@ -39,6 +39,7 @@ body: |
; GCN: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10
; GCN-NEXT: {{ $}}
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr10, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr10, 32, $exec_lo, 32, 0
; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
diff --git a/llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir b/llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir
index 4444eab433b9f..bc8984c6d3e87 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir
@@ -25,15 +25,57 @@ machineFunctionInfo:
body: |
bb.0:
liveins: $sgpr30_sgpr31
- ; CHECK-LABEL: name: one_block
- ; CHECK: liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $m0 = S_MOV_B32 9
- ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
- ; CHECK-NEXT: $m0 = S_MOV_B32 9
- ; CHECK-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ; W32-LABEL: name: one_block
+ ; W32: liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+ ; W32-NEXT: {{ $}}
+ ; W32-NEXT: $m0 = S_MOV_B32 9
+ ; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec_lo, 32, 0
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr43
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec_lo, 32, 96
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr72
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr73
+ ; W32-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
+ ; W32-NEXT: $m0 = S_MOV_B32 9
+ ; W32-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
+ ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ;
+ ; W64-LABEL: name: one_block
+ ; W64: liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+ ; W64-NEXT: {{ $}}
+ ; W64-NEXT: $m0 = S_MOV_B32 9
+ ; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec, 64, 0
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr43
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec, 64, 192
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr72
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr73
+ ; W64-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
+ ; W64-NEXT: $m0 = S_MOV_B32 9
+ ; W64-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
+ ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
S_SETPC_B64_return $sgpr30_sgpr31
...
@@ -49,15 +91,57 @@ machineFunctionInfo:
body: |
bb.0:
liveins: $sgpr30_sgpr31
- ; CHECK-LABEL: name: one_block_csr_only
- ; CHECK: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $m0 = S_MOV_B32 16711935
- ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit-def $vgpr64, implicit-def $vgpr65, implicit-def $vgpr66
- ; CHECK-NEXT: $m0 = S_MOV_B32 16711935
- ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ; W32-LABEL: name: one_block_csr_only
+ ; W32: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71
+ ; W32-NEXT: {{ $}}
+ ; W32-NEXT: $m0 = S_MOV_B32 16711935
+ ; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec_lo, 32, 0
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec_lo, 32, 32
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec_lo, 32, 64
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec_lo, 32, 96
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec_lo, 32, 128
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec_lo, 32, 160
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec_lo, 32, 192
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr47, 32, $exec_lo, 32, 224
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec_lo, 32, 512
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec_lo, 32, 544
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec_lo, 32, 576
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec_lo, 32, 608
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec_lo, 32, 640
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec_lo, 32, 672
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr62, 32, $exec_lo, 32, 704
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr63, 32, $exec_lo, 32, 736
+ ; W32-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit-def $vgpr64, implicit-def $vgpr65, implicit-def $vgpr66
+ ; W32-NEXT: $m0 = S_MOV_B32 16711935
+ ; W32-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
+ ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ;
+ ; W64-LABEL: name: one_block_csr_only
+ ; W64: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71
+ ; W64-NEXT: {{ $}}
+ ; W64-NEXT: $m0 = S_MOV_B32 16711935
+ ; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec, 64, 0
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec, 64, 64
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec, 64, 128
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec, 64, 192
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr44, 32, $exec, 64, 256
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec, 64, 320
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr46, 32, $exec, 64, 384
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr47, 32, $exec, 64, 448
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr56, 32, $exec, 64, 1024
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr57, 32, $exec, 64, 1088
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr58, 32, $exec, 64, 1152
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr59, 32, $exec, 64, 1216
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr60, 32, $exec, 64, 1280
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr61, 32, $exec, 64, 1344
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr62, 32, $exec, 64, 1408
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr63, 32, $exec, 64, 1472
+ ; W64-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit-def $vgpr64, implicit-def $vgpr65, implicit-def $vgpr66
+ ; W64-NEXT: $m0 = S_MOV_B32 16711935
+ ; W64-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
+ ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit-def $vgpr64, implicit-def $vgpr65, implicit-def $vgpr66
S_SETPC_B64_return $sgpr30_sgpr31
...
@@ -77,23 +161,137 @@ machineFunctionInfo:
body: |
bb.0:
liveins: $sgpr30_sgpr31
- ; CHECK-LABEL: name: multiple_blocks
- ; CHECK: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135, $vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255_vgpr256_vgpr257_vgpr258_vgpr259_vgpr260_vgpr261_vgpr262_vgpr263
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $m0 = S_MOV_B32 3
- ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: $m0 = S_MOV_B32 65
- ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.1, align 4, addrspace 5)
- ; CHECK-NEXT: $m0 = S_MOV_B32 1
- ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255_vgpr256_vgpr257_vgpr258_vgpr259_vgpr260_vgpr261_vgpr262_vgpr263, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.2, align 4, addrspace 5)
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr104, implicit-def $vgpr110, implicit-def $vgpr232
- ; CHECK-NEXT: $m0 = S_MOV_B32 1
- ; CHECK-NEXT: $vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255_vgpr256_vgpr257_vgpr258_vgpr259_vgpr260_vgpr261_vgpr262_vgpr263 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr233, implicit $vgpr234, implicit $vgpr235, implicit $vgpr236, implicit $vgpr237, implicit $vgpr238, implicit $vgpr239, implicit $vgpr248, implicit $vgpr249, implicit $vgpr250, implicit $vgpr251, implicit $vgpr252, implicit $vgpr253, implicit $vgpr254, implicit $vgpr255 :: ("amdgpu-thread-private" load (s1024) from %stack.2, align 4, addrspace 5)
- ; CHECK-NEXT: $m0 = S_MOV_B32 65
- ; CHECK-NEXT: $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr105, implicit $vgpr106, implicit $vgpr107, implicit $vgpr108, implicit $vgpr109, implicit $vgpr111, implicit $vgpr120, implicit $vgpr121, implicit $vgpr122, implicit $vgpr123, implicit $vgpr124, implicit $vgpr125, implicit $vgpr126, implicit $vgpr127 :: ("amdgpu-thread-private" load (s1024) from %stack.1, align 4, addrspace 5)
- ; CHECK-NEXT: $m0 = S_MOV_B32 3
- ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr42, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ; W32-LABEL: name: multiple_blocks
+ ; W32: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135, $vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255_vgpr256_vgpr257_vgpr258_vgpr259_vgpr260_vgpr261_vgpr262_vgpr263
+ ; W32-NEXT: {{ $}}
+ ; W32-NEXT: $m0 = S_MOV_B32 3
+ ; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec_lo, 32, 1024
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec_lo, 32, 1056
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr42
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr43
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr45
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
+ ; W32-NEXT: $m0 = S_MOV_B32 65
+ ; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.1, align 4, addrspace 5)
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr104, 32, $exec_lo, 32, 128
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr105
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr106
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr107
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr108
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr109
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr110, 32, $exec_lo, 32, 320
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr111
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr120
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr121
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr122
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr123
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr124
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr125
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr126
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr127
+ ; W32-NEXT: $m0 = S_MOV_B32 1
+ ; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255_vgpr256_vgpr257_vgpr258_vgpr259_vgpr260_vgpr261_vgpr262_vgpr263, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.2, align 4, addrspace 5)
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr232, 32, $exec_lo, 32, 0
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr233
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr234
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr235
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr236
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr237
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr238
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr239
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr248
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr249
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr250
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr251
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr252
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr253
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr254
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr255
+ ; W32-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr104, implicit-def $vgpr110, implicit-def $vgpr232
+ ; W32-NEXT: $m0 = S_MOV_B32 1
+ ; W32-NEXT: $vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255_vgpr256_vgpr257_vgpr258_vgpr259_vgpr260_vgpr261_vgpr262_vgpr263 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr233, implicit $vgpr234, implicit $vgpr235, implicit $vgpr236, implicit $vgpr237, implicit $vgpr238, implicit $vgpr239, implicit $vgpr248, implicit $vgpr249, implicit $vgpr250, implicit $vgpr251, implicit $vgpr252, implicit $vgpr253, implicit $vgpr254, implicit $vgpr255 :: ("amdgpu-thread-private" load (s1024) from %stack.2, align 4, addrspace 5)
+ ; W32-NEXT: $m0 = S_MOV_B32 65
+ ; W32-NEXT: $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr105, implicit $vgpr106, implicit $vgpr107, implicit $vgpr108, implicit $vgpr109, implicit $vgpr111, implicit $vgpr120, implicit $vgpr121, implicit $vgpr122, implicit $vgpr123, implicit $vgpr124, implicit $vgpr125, implicit $vgpr126, implicit $vgpr127 :: ("amdgpu-thread-private" load (s1024) from %stack.1, align 4, addrspace 5)
+ ; W32-NEXT: $m0 = S_MOV_B32 3
+ ; W32-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr42, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
+ ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ;
+ ; W64-LABEL: name: multiple_blocks
+ ; W64: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135, $vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255_vgpr256_vgpr257_vgpr258_vgpr259_vgpr260_vgpr261_vgpr262_vgpr263
+ ; W64-NEXT: {{ $}}
+ ; W64-NEXT: $m0 = S_MOV_B32 3
+ ; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec, 64, 2048
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec, 64, 2112
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr42
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr43
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr45
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
+ ; W64-NEXT: $m0 = S_MOV_B32 65
+ ; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.1, align 4, addrspace 5)
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr104, 32, $exec, 64, 256
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr105
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr106
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr107
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr108
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr109
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr110, 32, $exec, 64, 640
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr111
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr120
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr121
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr122
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr123
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr124
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr125
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr126
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr127
+ ; W64-NEXT: $m0 = S_MOV_B32 1
+ ; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255_vgpr256_vgpr257_vgpr258_vgpr259_vgpr260_vgpr261_vgpr262_vgpr263, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.2, align 4, addrspace 5)
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr232, 32, $exec, 64, 0
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr233
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr234
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr235
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr236
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr237
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr238
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr239
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr248
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr249
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr250
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr251
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr252
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr253
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr254
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr255
+ ; W64-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr104, implicit-def $vgpr110, implicit-def $vgpr232
+ ; W64-NEXT: $m0 = S_MOV_B32 1
+ ; W64-NEXT: $vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255_vgpr256_vgpr257_vgpr258_vgpr259_vgpr260_vgpr261_vgpr262_vgpr263 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr233, implicit $vgpr234, implicit $vgpr235, implicit $vgpr236, implicit $vgpr237, implicit $vgpr238, implicit $vgpr239, implicit $vgpr248, implicit $vgpr249, implicit $vgpr250, implicit $vgpr251, implicit $vgpr252, implicit $vgpr253, implicit $vgpr254, implicit $vgpr255 :: ("amdgpu-thread-private" load (s1024) from %stack.2, align 4, addrspace 5)
+ ; W64-NEXT: $m0 = S_MOV_B32 65
+ ; W64-NEXT: $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr105, implicit $vgpr106, implicit $vgpr107, implicit $vgpr108, implicit $vgpr109, implicit $vgpr111, implicit $vgpr120, implicit $vgpr121, implicit $vgpr122, implicit $vgpr123, implicit $vgpr124, implicit $vgpr125, implicit $vgpr126, implicit $vgpr127 :: ("amdgpu-thread-private" load (s1024) from %stack.1, align 4, addrspace 5)
+ ; W64-NEXT: $m0 = S_MOV_B32 3
+ ; W64-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr42, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
+ ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr104, implicit-def $vgpr110, implicit-def $vgpr232
S_SETPC_B64_return $sgpr30_sgpr31
...
@@ -111,19 +309,97 @@ machineFunctionInfo:
body: |
bb.0:
liveins: $sgpr30_sgpr31
- ; CHECK-LABEL: name: reg_tuples
- ; CHECK: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $m0 = S_MOV_B32 7
- ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: $m0 = S_MOV_B32 3
- ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.1, align 4, addrspace 5)
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42, implicit-def $vgpr70_vgpr71_vgpr72_vgpr73
- ; CHECK-NEXT: $m0 = S_MOV_B32 3
- ; CHECK-NEXT: $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr74, implicit $vgpr75, implicit $vgpr76, implicit $vgpr77, implicit $vgpr78, implicit $vgpr79, implicit $vgpr88, implicit $vgpr89, implicit $vgpr90, implicit $vgpr91, implicit $vgpr92, implicit $vgpr93, implicit $vgpr94, implicit $vgpr95 :: ("amdgpu-thread-private" load (s1024) from %stack.1, align 4, addrspace 5)
- ; CHECK-NEXT: $m0 = S_MOV_B32 7
- ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ; W32-LABEL: name: reg_tuples
+ ; W32: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103
+ ; W32-NEXT: {{ $}}
+ ; W32-NEXT: $m0 = S_MOV_B32 7
+ ; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec_lo, 32, 256
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec_lo, 32, 288
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec_lo, 32, 320
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr43
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr45
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
+ ; W32-NEXT: $m0 = S_MOV_B32 3
+ ; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.1, align 4, addrspace 5)
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr72, 32, $exec_lo, 32, 0
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr73, 32, $exec_lo, 32, 32
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr74
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr75
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr76
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr77
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr78
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr79
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr88
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr89
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr90
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr91
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr92
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr93
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr94
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr95
+ ; W32-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42, implicit-def $vgpr70_vgpr71_vgpr72_vgpr73
+ ; W32-NEXT: $m0 = S_MOV_B32 3
+ ; W32-NEXT: $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr74, implicit $vgpr75, implicit $vgpr76, implicit $vgpr77, implicit $vgpr78, implicit $vgpr79, implicit $vgpr88, implicit $vgpr89, implicit $vgpr90, implicit $vgpr91, implicit $vgpr92, implicit $vgpr93, implicit $vgpr94, implicit $vgpr95 :: ("amdgpu-thread-private" load (s1024) from %stack.1, align 4, addrspace 5)
+ ; W32-NEXT: $m0 = S_MOV_B32 7
+ ; W32-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
+ ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ;
+ ; W64-LABEL: name: reg_tuples
+ ; W64: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103
+ ; W64-NEXT: {{ $}}
+ ; W64-NEXT: $m0 = S_MOV_B32 7
+ ; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec, 64, 512
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr41, 32, $exec, 64, 576
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec, 64, 640
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr43
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr45
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
+ ; W64-NEXT: $m0 = S_MOV_B32 3
+ ; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.1, align 4, addrspace 5)
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr72, 32, $exec, 64, 0
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr73, 32, $exec, 64, 64
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr74
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr75
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr76
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr77
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr78
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr79
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr88
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr89
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr90
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr91
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr92
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr93
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr94
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr95
+ ; W64-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42, implicit-def $vgpr70_vgpr71_vgpr72_vgpr73
+ ; W64-NEXT: $m0 = S_MOV_B32 3
+ ; W64-NEXT: $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr74, implicit $vgpr75, implicit $vgpr76, implicit $vgpr77, implicit $vgpr78, implicit $vgpr79, implicit $vgpr88, implicit $vgpr89, implicit $vgpr90, implicit $vgpr91, implicit $vgpr92, implicit $vgpr93, implicit $vgpr94, implicit $vgpr95 :: ("amdgpu-thread-private" load (s1024) from %stack.1, align 4, addrspace 5)
+ ; W64-NEXT: $m0 = S_MOV_B32 7
+ ; W64-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
+ ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42, implicit-def $vgpr70_vgpr71_vgpr72_vgpr73
S_SETPC_B64_return $sgpr30_sgpr31
...
@@ -147,17 +423,61 @@ stack:
body: |
bb.0:
liveins: $sgpr30_sgpr31, $vgpr48
- ; CHECK-LABEL: name: locals
- ; CHECK: liveins: $vgpr48, $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $m0 = S_MOV_B32 1
- ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.2, align 4, addrspace 5)
- ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr48, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr48, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40
- ; CHECK-NEXT: $m0 = S_MOV_B32 1
- ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr41, implicit $vgpr42, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: ("amdgpu-thread-private" load (s1024) from %stack.2, align 4, addrspace 5)
- ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ; W32-LABEL: name: locals
+ ; W32: liveins: $vgpr48, $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71
+ ; W32-NEXT: {{ $}}
+ ; W32-NEXT: $m0 = S_MOV_B32 1
+ ; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.2, align 4, addrspace 5)
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec_lo, 32, 0
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr41
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr42
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr43
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr45
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
+ ; W32-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr48, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; W32-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr48, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; W32-NEXT: S_NOP 0, implicit-def $vgpr40
+ ; W32-NEXT: $m0 = S_MOV_B32 1
+ ; W32-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr41, implicit $vgpr42, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: ("amdgpu-thread-private" load (s1024) from %stack.2, align 4, addrspace 5)
+ ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ;
+ ; W64-LABEL: name: locals
+ ; W64: liveins: $vgpr48, $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71
+ ; W64-NEXT: {{ $}}
+ ; W64-NEXT: $m0 = S_MOV_B32 1
+ ; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.2, align 4, addrspace 5)
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec, 64, 0
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr41
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr42
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr43
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr45
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
+ ; W64-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr48, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; W64-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr48, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; W64-NEXT: S_NOP 0, implicit-def $vgpr40
+ ; W64-NEXT: $m0 = S_MOV_B32 1
+ ; W64-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr41, implicit $vgpr42, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: ("amdgpu-thread-private" load (s1024) from %stack.2, align 4, addrspace 5)
+ ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
SCRATCH_STORE_DWORD_SADDR $vgpr48, %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
SCRATCH_STORE_DWORD_SADDR $vgpr48, %stack.1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
S_NOP 0, implicit-def $vgpr40
@@ -191,7 +511,24 @@ body: |
; W32-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; W32-NEXT: $m0 = S_MOV_B32 9
; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.4, align 4, addrspace 5)
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec_lo, 32, 0
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr41
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr42
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec_lo, 32, 96
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr45
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
; W32-NEXT: $vgpr44 = SI_SPILL_S32_TO_VGPR $sgpr48, 0, $vgpr44
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr48, $vgpr191, 0, 32
; W32-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr43, implicit-def $sgpr22, implicit-def $sgpr48, implicit-def $m0, implicit-def $exec
; W32-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40
; W32-NEXT: $sgpr48 = SI_RESTORE_S32_FROM_VGPR $vgpr44, 0
@@ -214,7 +551,24 @@ body: |
; W64-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
; W64-NEXT: $m0 = S_MOV_B32 9
; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.4, align 4, addrspace 5)
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr40, 32, $exec, 64, 0
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr41
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr42
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec, 64, 192
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr45
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
; W64-NEXT: $vgpr44 = SI_SPILL_S32_TO_VGPR $sgpr48, 0, $vgpr44
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr48, $vgpr191, 0, 32
; W64-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr43, implicit-def $sgpr22, implicit-def $sgpr48, implicit-def $m0, implicit-def $exec
; W64-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40
; W64-NEXT: $sgpr48 = SI_RESTORE_S32_FROM_VGPR $vgpr44, 0
@@ -257,29 +611,85 @@ tracksRegLiveness: true
machineFunctionInfo:
stackPtrOffsetReg: $sgpr32
body: |
- ; CHECK-LABEL: name: multiple_basic_blocks
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: liveins: $vgpr44, $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $m0 = S_MOV_B32 11
- ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
- ; CHECK-NEXT: S_BRANCH %bb.1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: successors: %bb.2(0x80000000)
- ; CHECK-NEXT: liveins: $vgpr44, $sgpr30_sgpr31
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr43, implicit $vgpr44
- ; CHECK-NEXT: S_BRANCH %bb.2
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $m0 = S_MOV_B32 11
- ; CHECK-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr44, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ; W32-LABEL: name: multiple_basic_blocks
+ ; W32: bb.0:
+ ; W32-NEXT: successors: %bb.1(0x80000000)
+ ; W32-NEXT: liveins: $vgpr44, $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+ ; W32-NEXT: {{ $}}
+ ; W32-NEXT: $m0 = S_MOV_B32 11
+ ; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec_lo, 32, 0
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec_lo, 32, 32
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec_lo, 32, 96
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr72
+ ; W32-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr73
+ ; W32-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
+ ; W32-NEXT: S_BRANCH %bb.1
+ ; W32-NEXT: {{ $}}
+ ; W32-NEXT: bb.1:
+ ; W32-NEXT: successors: %bb.2(0x80000000)
+ ; W32-NEXT: liveins: $vgpr44, $sgpr30_sgpr31
+ ; W32-NEXT: {{ $}}
+ ; W32-NEXT: S_NOP 0, implicit-def $vgpr43, implicit $vgpr44
+ ; W32-NEXT: S_BRANCH %bb.2
+ ; W32-NEXT: {{ $}}
+ ; W32-NEXT: bb.2:
+ ; W32-NEXT: liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+ ; W32-NEXT: {{ $}}
+ ; W32-NEXT: $m0 = S_MOV_B32 11
+ ; W32-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr44, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
+ ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ;
+ ; W64-LABEL: name: multiple_basic_blocks
+ ; W64: bb.0:
+ ; W64-NEXT: successors: %bb.1(0x80000000)
+ ; W64-NEXT: liveins: $vgpr44, $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+ ; W64-NEXT: {{ $}}
+ ; W64-NEXT: $m0 = S_MOV_B32 11
+ ; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: ("amdgpu-thread-private" store (s1024) into %stack.0, align 4, addrspace 5)
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr42, 32, $exec, 64, 0
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr43, 32, $exec, 64, 64
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr44
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_offset $vgpr45, 32, $exec, 64, 192
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr46
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr47
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr56
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr57
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr58
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr59
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr60
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr61
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr62
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr63
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr72
+ ; W64-NEXT: frame-setup CFI_INSTRUCTION same_value $vgpr73
+ ; W64-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
+ ; W64-NEXT: S_BRANCH %bb.1
+ ; W64-NEXT: {{ $}}
+ ; W64-NEXT: bb.1:
+ ; W64-NEXT: successors: %bb.2(0x80000000)
+ ; W64-NEXT: liveins: $vgpr44, $sgpr30_sgpr31
+ ; W64-NEXT: {{ $}}
+ ; W64-NEXT: S_NOP 0, implicit-def $vgpr43, implicit $vgpr44
+ ; W64-NEXT: S_BRANCH %bb.2
+ ; W64-NEXT: {{ $}}
+ ; W64-NEXT: bb.2:
+ ; W64-NEXT: liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+ ; W64-NEXT: {{ $}}
+ ; W64-NEXT: $m0 = S_MOV_B32 11
+ ; W64-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr44, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: ("amdgpu-thread-private" load (s1024) from %stack.0, align 4, addrspace 5)
+ ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
bb.0:
liveins: $sgpr30_sgpr31, $vgpr44
S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index 6fefed6e07f2d..db33f43e65034 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -17,6 +17,13 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: s_mov_b64 exec, -1
; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
; GFX906-NEXT: s_mov_b64 exec, s[18:19]
+; GFX906-NEXT: v_writelane_b32 v41, s16, 4
+; GFX906-NEXT: v_writelane_b32 v41, s34, 2
+; GFX906-NEXT: v_writelane_b32 v41, s35, 3
+; GFX906-NEXT: s_addk_i32 s32, 0x2800
+; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX906-NEXT: v_writelane_b32 v41, s30, 0
+; GFX906-NEXT: v_writelane_b32 v41, s31, 1
; GFX906-NEXT: s_mov_b32 s21, s15
; GFX906-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane
; GFX906-NEXT: s_mov_b32 s22, s14
@@ -30,17 +37,10 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: v_writelane_b32 v39, s26, 4
; GFX906-NEXT: v_writelane_b32 v39, s27, 5
; GFX906-NEXT: v_writelane_b32 v39, s8, 6
-; GFX906-NEXT: v_writelane_b32 v41, s16, 4
; GFX906-NEXT: v_writelane_b32 v39, s9, 7
-; GFX906-NEXT: v_writelane_b32 v41, s34, 2
; GFX906-NEXT: v_writelane_b32 v39, s6, 8
-; GFX906-NEXT: v_writelane_b32 v41, s35, 3
; GFX906-NEXT: v_writelane_b32 v39, s7, 9
-; GFX906-NEXT: v_writelane_b32 v41, s30, 0
; GFX906-NEXT: v_writelane_b32 v39, s4, 10
-; GFX906-NEXT: s_addk_i32 s32, 0x2800
-; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX906-NEXT: v_writelane_b32 v41, s31, 1
; GFX906-NEXT: v_mov_b32_e32 v32, v31
; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX906-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index f02b895cc6e7d..9054ff97ddbca 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -11,8 +11,8 @@ define void @test_remat_s_getpc_b64() #0 {
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v2, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: ;;#ASMSTART
@@ -36,16 +36,15 @@ define void @test_remat_s_getpc_b64() #0 {
; GFX11-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
@@ -66,20 +65,20 @@ define void @test_remat_s_getpc_b64() #0 {
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v2, s30, 0
+; GFX12-NEXT: v_writelane_b32 v2, s31, 1
; GFX12-NEXT: s_getpc_b64 s[0:1]
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_sext_i32_i16 s1, s1
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_writelane_b32 v2, s31, 1
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_getpc_b64 s[0:1]
+; GFX12-NEXT: v_readlane_b32 s30, v2, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_sext_i32_i16 s1, s1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_readlane_b32 s30, v2, 0
; GFX12-NEXT: v_readlane_b32 s31, v2, 1
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index 7fb73949fce57..9845527d0ce51 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -41,73 +41,140 @@ body: |
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr4, $vgpr255, 0, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr5, $vgpr255, 1, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr6, $vgpr255, 2, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr7, $vgpr255, 3, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr8, $vgpr255, 4, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr9, $vgpr255, 5, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr10, $vgpr255, 6, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr11, $vgpr255, 7, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr12, $vgpr255, 8, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr13, $vgpr255, 9, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr14, 10, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr14, $vgpr255, 10, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr15, 11, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr15, $vgpr255, 11, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr16, 12, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr16, $vgpr255, 12, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr17, 13, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr17, $vgpr255, 13, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr18, 14, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr18, $vgpr255, 14, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr19, 15, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr19, $vgpr255, 15, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr20, 16, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr20, $vgpr255, 16, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr21, 17, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr21, $vgpr255, 17, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 18, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr22, $vgpr255, 18, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr23, 19, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr23, $vgpr255, 19, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr24, 20, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr24, $vgpr255, 20, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr25, 21, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr25, $vgpr255, 21, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr26, 22, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr26, $vgpr255, 22, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr27, $vgpr255, 23, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr28, $vgpr255, 24, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr29, $vgpr255, 25, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 26, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr64, $vgpr255, 26, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 27, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr65, $vgpr255, 27, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 28, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr66, $vgpr255, 28, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 29, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr67, $vgpr255, 29, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr68, 30, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr68, $vgpr255, 30, 32
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr69, 31, $vgpr2
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr69, $vgpr255, 31, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 0, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr70, $vgpr254, 0, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 1, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr71, $vgpr254, 1, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 2, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr72, $vgpr254, 2, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 3, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr73, $vgpr254, 3, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 4, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr74, $vgpr254, 4, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 5, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr75, $vgpr254, 5, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 6, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr76, $vgpr254, 6, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 7, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr77, $vgpr254, 7, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 8, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr78, $vgpr254, 8, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 9, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr79, $vgpr254, 9, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 10, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr80, $vgpr254, 10, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 11, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr81, $vgpr254, 11, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 12, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr82, $vgpr254, 12, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 13, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr83, $vgpr254, 13, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 14, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr84, $vgpr254, 14, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 15, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr85, $vgpr254, 15, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 16, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr86, $vgpr254, 16, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 17, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr87, $vgpr254, 17, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 18, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr88, $vgpr254, 18, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 19, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr89, $vgpr254, 19, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 20, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr90, $vgpr254, 20, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 21, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr91, $vgpr254, 21, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 22, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr92, $vgpr254, 22, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 23, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr93, $vgpr254, 23, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 24, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr94, $vgpr254, 24, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 25, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr95, $vgpr254, 25, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 26, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr96, $vgpr254, 26, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 27, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr97, $vgpr254, 27, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 28, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr98, $vgpr254, 28, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 29, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr99, $vgpr254, 29, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr100, 30, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr100, $vgpr254, 30, 32
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr101, 31, $vgpr3
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr101, $vgpr254, 31, 32
; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 0, $vgpr4
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr102, $vgpr253, 0, 32
; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 1, $vgpr4
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr103, $vgpr253, 1, 32
; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr30, 2, $vgpr4, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr31, 3, $vgpr4, implicit $sgpr30_sgpr31
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $pc_reg, $vgpr253, 2, 32, $vgpr253, 3, 32
; GCN-NEXT: $sgpr22 = IMPLICIT_DEF
; GCN-NEXT: $vgpr5 = IMPLICIT_DEF
; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
index 5ead5e768af5d..a747af428c1b5 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
@@ -14601,10 +14601,10 @@ define void @s_shuffle_v2i64_v8i64__15_2() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[8:23]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -14630,10 +14630,10 @@ define void @s_shuffle_v2i64_v8i64__15_2() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[8:23]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -14741,10 +14741,10 @@ define void @s_shuffle_v2i64_v8i64__15_4() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -14770,10 +14770,10 @@ define void @s_shuffle_v2i64_v8i64__15_4() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -14799,18 +14799,19 @@ define void @s_shuffle_v2i64_v8i64__15_4() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s30
; GFX942-NEXT: s_mov_b32 s9, s31
; GFX942-NEXT: v_readlane_b32 s30, v0, 0
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
@@ -14835,10 +14836,10 @@ define void @s_shuffle_v2i64_v8i64__15_5() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -14864,10 +14865,10 @@ define void @s_shuffle_v2i64_v8i64__15_5() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -14999,18 +15000,19 @@ define void @s_shuffle_v2i64_v8i64__15_6() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s30
; GFX942-NEXT: s_mov_b32 s9, s31
; GFX942-NEXT: v_readlane_b32 s30, v0, 0
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
@@ -15041,10 +15043,10 @@ define void @s_shuffle_v2i64_v8i64__15_7() #0 {
; GFX900-NEXT: v_writelane_b32 v0, s48, 4
; GFX900-NEXT: v_writelane_b32 v0, s49, 5
; GFX900-NEXT: v_writelane_b32 v0, s50, 6
+; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
@@ -15082,10 +15084,10 @@ define void @s_shuffle_v2i64_v8i64__15_7() #0 {
; GFX90A-NEXT: v_writelane_b32 v0, s48, 4
; GFX90A-NEXT: v_writelane_b32 v0, s49, 5
; GFX90A-NEXT: v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
@@ -15117,11 +15119,11 @@ define void @s_shuffle_v2i64_v8i64__15_7() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -16168,14 +16170,14 @@ define void @s_shuffle_v2i64_v8i64__12_0() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
-; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s16
; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s17
@@ -16892,14 +16894,14 @@ define void @s_shuffle_v2i64_v8i64__12_1() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
-; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s18
; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s19
@@ -17477,10 +17479,10 @@ define void @s_shuffle_v2i64_v8i64__9_2() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[8:23]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -17506,10 +17508,10 @@ define void @s_shuffle_v2i64_v8i64__9_2() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[8:23]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -17651,10 +17653,10 @@ define void @s_shuffle_v2i64_v8i64__11_2() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[8:23]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -17680,10 +17682,10 @@ define void @s_shuffle_v2i64_v8i64__11_2() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[8:23]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -17795,10 +17797,10 @@ define void @s_shuffle_v2i64_v8i64__13_2() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[8:23]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -17824,10 +17826,10 @@ define void @s_shuffle_v2i64_v8i64__13_2() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[8:23]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -18948,10 +18950,10 @@ define void @s_shuffle_v2i64_v8i64__9_4() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -18977,10 +18979,10 @@ define void @s_shuffle_v2i64_v8i64__9_4() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -19006,18 +19008,19 @@ define void @s_shuffle_v2i64_v8i64__9_4() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s18
; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s19
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
@@ -19098,10 +19101,10 @@ define void @s_shuffle_v2i64_v8i64__11_4() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -19127,10 +19130,10 @@ define void @s_shuffle_v2i64_v8i64__11_4() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -19156,18 +19159,19 @@ define void @s_shuffle_v2i64_v8i64__11_4() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s22
; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s23
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
@@ -19192,10 +19196,10 @@ define void @s_shuffle_v2i64_v8i64__12_4() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -19221,10 +19225,10 @@ define void @s_shuffle_v2i64_v8i64__12_4() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -19274,10 +19278,10 @@ define void @s_shuffle_v2i64_v8i64__13_4() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -19303,10 +19307,10 @@ define void @s_shuffle_v2i64_v8i64__13_4() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -19332,18 +19336,19 @@ define void @s_shuffle_v2i64_v8i64__13_4() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s26
; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s27
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
@@ -19368,10 +19373,10 @@ define void @s_shuffle_v2i64_v8i64__14_4() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -19397,10 +19402,10 @@ define void @s_shuffle_v2i64_v8i64__14_4() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -19869,10 +19874,10 @@ define void @s_shuffle_v2i64_v8i64__9_5() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -19898,10 +19903,10 @@ define void @s_shuffle_v2i64_v8i64__9_5() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -20007,10 +20012,10 @@ define void @s_shuffle_v2i64_v8i64__11_5() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -20036,10 +20041,10 @@ define void @s_shuffle_v2i64_v8i64__11_5() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -20089,10 +20094,10 @@ define void @s_shuffle_v2i64_v8i64__12_5() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -20118,10 +20123,10 @@ define void @s_shuffle_v2i64_v8i64__12_5() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -20171,10 +20176,10 @@ define void @s_shuffle_v2i64_v8i64__13_5() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -20200,10 +20205,10 @@ define void @s_shuffle_v2i64_v8i64__13_5() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -20253,10 +20258,10 @@ define void @s_shuffle_v2i64_v8i64__14_5() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -20282,10 +20287,10 @@ define void @s_shuffle_v2i64_v8i64__14_5() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -20851,18 +20856,19 @@ define void @s_shuffle_v2i64_v8i64__9_6() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s18
; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s19
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
@@ -21025,18 +21031,19 @@ define void @s_shuffle_v2i64_v8i64__11_6() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s22
; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s23
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
@@ -21249,18 +21256,19 @@ define void @s_shuffle_v2i64_v8i64__13_6() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s26
; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s27
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
@@ -21367,11 +21375,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -21836,10 +21844,10 @@ define void @s_shuffle_v2i64_v8i64__9_7() #0 {
; GFX900-NEXT: v_writelane_b32 v0, s48, 4
; GFX900-NEXT: v_writelane_b32 v0, s49, 5
; GFX900-NEXT: v_writelane_b32 v0, s50, 6
+; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
@@ -21877,10 +21885,10 @@ define void @s_shuffle_v2i64_v8i64__9_7() #0 {
; GFX90A-NEXT: v_writelane_b32 v0, s48, 4
; GFX90A-NEXT: v_writelane_b32 v0, s49, 5
; GFX90A-NEXT: v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
@@ -21912,11 +21920,11 @@ define void @s_shuffle_v2i64_v8i64__9_7() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -22011,10 +22019,10 @@ define void @s_shuffle_v2i64_v8i64__11_7() #0 {
; GFX900-NEXT: v_writelane_b32 v0, s48, 4
; GFX900-NEXT: v_writelane_b32 v0, s49, 5
; GFX900-NEXT: v_writelane_b32 v0, s50, 6
+; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
@@ -22052,10 +22060,10 @@ define void @s_shuffle_v2i64_v8i64__11_7() #0 {
; GFX90A-NEXT: v_writelane_b32 v0, s48, 4
; GFX90A-NEXT: v_writelane_b32 v0, s49, 5
; GFX90A-NEXT: v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
@@ -22087,11 +22095,11 @@ define void @s_shuffle_v2i64_v8i64__11_7() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -22236,10 +22244,10 @@ define void @s_shuffle_v2i64_v8i64__13_7() #0 {
; GFX900-NEXT: v_writelane_b32 v0, s48, 4
; GFX900-NEXT: v_writelane_b32 v0, s49, 5
; GFX900-NEXT: v_writelane_b32 v0, s50, 6
+; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
@@ -22277,10 +22285,10 @@ define void @s_shuffle_v2i64_v8i64__13_7() #0 {
; GFX90A-NEXT: v_writelane_b32 v0, s48, 4
; GFX90A-NEXT: v_writelane_b32 v0, s49, 5
; GFX90A-NEXT: v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
@@ -22312,11 +22320,11 @@ define void @s_shuffle_v2i64_v8i64__13_7() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -22431,11 +22439,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -23434,10 +23442,10 @@ define void @s_shuffle_v2i64_v8i64__4_9() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -23463,10 +23471,10 @@ define void @s_shuffle_v2i64_v8i64__4_9() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -23612,10 +23620,10 @@ define void @s_shuffle_v2i64_v8i64__6_9() #0 {
; GFX900-NEXT: v_writelane_b32 v0, s48, 4
; GFX900-NEXT: v_writelane_b32 v0, s49, 5
; GFX900-NEXT: v_writelane_b32 v0, s50, 6
+; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
@@ -23653,10 +23661,10 @@ define void @s_shuffle_v2i64_v8i64__6_9() #0 {
; GFX90A-NEXT: v_writelane_b32 v0, s48, 4
; GFX90A-NEXT: v_writelane_b32 v0, s49, 5
; GFX90A-NEXT: v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
@@ -23688,11 +23696,11 @@ define void @s_shuffle_v2i64_v8i64__6_9() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -24286,10 +24294,10 @@ define void @s_shuffle_v2i64_v8i64__4_10() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -24315,10 +24323,10 @@ define void @s_shuffle_v2i64_v8i64__4_10() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -24368,10 +24376,10 @@ define void @s_shuffle_v2i64_v8i64__5_10() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -24397,10 +24405,10 @@ define void @s_shuffle_v2i64_v8i64__5_10() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -24457,10 +24465,10 @@ define void @s_shuffle_v2i64_v8i64__6_10() #0 {
; GFX900-NEXT: v_writelane_b32 v0, s48, 4
; GFX900-NEXT: v_writelane_b32 v0, s49, 5
; GFX900-NEXT: v_writelane_b32 v0, s50, 6
+; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
@@ -24498,10 +24506,10 @@ define void @s_shuffle_v2i64_v8i64__6_10() #0 {
; GFX90A-NEXT: v_writelane_b32 v0, s48, 4
; GFX90A-NEXT: v_writelane_b32 v0, s49, 5
; GFX90A-NEXT: v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
@@ -24533,11 +24541,11 @@ define void @s_shuffle_v2i64_v8i64__6_10() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -24652,17 +24660,18 @@ define void @s_shuffle_v2i64_v8i64__7_10() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[16:31]
+; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s14
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s15
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[16:31]
-; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s20
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s21
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
@@ -25331,10 +25340,10 @@ define void @s_shuffle_v2i64_v8i64__4_11() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -25360,10 +25369,10 @@ define void @s_shuffle_v2i64_v8i64__4_11() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -25475,10 +25484,10 @@ define void @s_shuffle_v2i64_v8i64__6_11() #0 {
; GFX900-NEXT: v_writelane_b32 v0, s48, 4
; GFX900-NEXT: v_writelane_b32 v0, s49, 5
; GFX900-NEXT: v_writelane_b32 v0, s50, 6
+; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
@@ -25516,10 +25525,10 @@ define void @s_shuffle_v2i64_v8i64__6_11() #0 {
; GFX90A-NEXT: v_writelane_b32 v0, s48, 4
; GFX90A-NEXT: v_writelane_b32 v0, s49, 5
; GFX90A-NEXT: v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
@@ -25551,11 +25560,11 @@ define void @s_shuffle_v2i64_v8i64__6_11() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -26149,10 +26158,10 @@ define void @s_shuffle_v2i64_v8i64__4_12() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -26178,10 +26187,10 @@ define void @s_shuffle_v2i64_v8i64__4_12() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -26231,10 +26240,10 @@ define void @s_shuffle_v2i64_v8i64__5_12() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -26260,10 +26269,10 @@ define void @s_shuffle_v2i64_v8i64__5_12() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -26320,10 +26329,10 @@ define void @s_shuffle_v2i64_v8i64__6_12() #0 {
; GFX900-NEXT: v_writelane_b32 v0, s48, 4
; GFX900-NEXT: v_writelane_b32 v0, s49, 5
; GFX900-NEXT: v_writelane_b32 v0, s50, 6
+; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
@@ -26361,10 +26370,10 @@ define void @s_shuffle_v2i64_v8i64__6_12() #0 {
; GFX90A-NEXT: v_writelane_b32 v0, s48, 4
; GFX90A-NEXT: v_writelane_b32 v0, s49, 5
; GFX90A-NEXT: v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
@@ -26396,11 +26405,11 @@ define void @s_shuffle_v2i64_v8i64__6_12() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -26515,17 +26524,18 @@ define void @s_shuffle_v2i64_v8i64__7_12() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[16:31]
+; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s14
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s15
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[16:31]
-; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s24
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s25
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
@@ -26896,14 +26906,14 @@ define void @s_shuffle_v2i64_v8i64__1_13() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
-; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s18
; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s19
@@ -27047,10 +27057,10 @@ define void @s_shuffle_v2i64_v8i64__4_13() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -27076,10 +27086,10 @@ define void @s_shuffle_v2i64_v8i64__4_13() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -27129,10 +27139,10 @@ define void @s_shuffle_v2i64_v8i64__5_13() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -27158,10 +27168,10 @@ define void @s_shuffle_v2i64_v8i64__5_13() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -27217,10 +27227,10 @@ define void @s_shuffle_v2i64_v8i64__6_13() #0 {
; GFX900-NEXT: v_writelane_b32 v0, s48, 4
; GFX900-NEXT: v_writelane_b32 v0, s49, 5
; GFX900-NEXT: v_writelane_b32 v0, s50, 6
+; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
@@ -27258,10 +27268,10 @@ define void @s_shuffle_v2i64_v8i64__6_13() #0 {
; GFX90A-NEXT: v_writelane_b32 v0, s48, 4
; GFX90A-NEXT: v_writelane_b32 v0, s49, 5
; GFX90A-NEXT: v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
@@ -27293,11 +27303,11 @@ define void @s_shuffle_v2i64_v8i64__6_13() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -28005,10 +28015,10 @@ define void @s_shuffle_v2i64_v8i64__4_14() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -28034,10 +28044,10 @@ define void @s_shuffle_v2i64_v8i64__4_14() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -28087,10 +28097,10 @@ define void @s_shuffle_v2i64_v8i64__5_14() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -28116,10 +28126,10 @@ define void @s_shuffle_v2i64_v8i64__5_14() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -28176,10 +28186,10 @@ define void @s_shuffle_v2i64_v8i64__6_14() #0 {
; GFX900-NEXT: v_writelane_b32 v0, s48, 4
; GFX900-NEXT: v_writelane_b32 v0, s49, 5
; GFX900-NEXT: v_writelane_b32 v0, s50, 6
+; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
@@ -28217,10 +28227,10 @@ define void @s_shuffle_v2i64_v8i64__6_14() #0 {
; GFX90A-NEXT: v_writelane_b32 v0, s48, 4
; GFX90A-NEXT: v_writelane_b32 v0, s49, 5
; GFX90A-NEXT: v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
@@ -28252,11 +28262,11 @@ define void @s_shuffle_v2i64_v8i64__6_14() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -28371,17 +28381,18 @@ define void @s_shuffle_v2i64_v8i64__7_14() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[16:31]
+; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s14
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s15
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[16:31]
-; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s28
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s29
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
@@ -28987,10 +28998,10 @@ define void @s_shuffle_v2i64_v8i64__4_15() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -29016,10 +29027,10 @@ define void @s_shuffle_v2i64_v8i64__4_15() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -29069,10 +29080,10 @@ define void @s_shuffle_v2i64_v8i64__5_15() #0 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_writelane_b32 v0, s30, 0
+; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s31, 1
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
@@ -29098,10 +29109,10 @@ define void @s_shuffle_v2i64_v8i64__5_15() #0 {
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
; GFX90A-NEXT: v_writelane_b32 v0, s30, 0
+; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s31, 1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
@@ -29159,10 +29170,10 @@ define void @s_shuffle_v2i64_v8i64__6_15() #0 {
; GFX900-NEXT: v_writelane_b32 v0, s48, 4
; GFX900-NEXT: v_writelane_b32 v0, s49, 5
; GFX900-NEXT: v_writelane_b32 v0, s50, 6
+; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_writelane_b32 v0, s51, 7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
@@ -29200,10 +29211,10 @@ define void @s_shuffle_v2i64_v8i64__6_15() #0 {
; GFX90A-NEXT: v_writelane_b32 v0, s48, 4
; GFX90A-NEXT: v_writelane_b32 v0, s49, 5
; GFX90A-NEXT: v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
@@ -29235,11 +29246,11 @@ define void @s_shuffle_v2i64_v8i64__6_15() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -29354,14 +29365,14 @@ define void @s_shuffle_v2i64_v8i64__7_15() #0 {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
-; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s28, s14
; GFX942-NEXT: s_mov_b32 s29, s15
; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29]
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
index ea67593d72761..9ebf4f57ed7d3 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
@@ -28,6 +28,7 @@ body: |
; SGPR_SPILLED-NEXT: {{ $}}
; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
+ ; SGPR_SPILLED-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $pc_reg, $vgpr62, 0, 32, $vgpr62, 1, 32
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]], implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
@@ -93,6 +94,7 @@ body: |
; SGPR_SPILLED-NEXT: {{ $}}
; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
+ ; SGPR_SPILLED-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $pc_reg, $vgpr62, 0, 32, $vgpr62, 1, 32
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
@@ -156,6 +158,7 @@ body: |
; SGPR_SPILLED-NEXT: {{ $}}
; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
+ ; SGPR_SPILLED-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $pc_reg, $vgpr62, 0, 32, $vgpr62, 1, 32
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]]
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir
index 1332e33b9f2d1..2f769d94f174d 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir
@@ -1,86 +1,25 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s
+# CHECK-LABEL: name: empty_entry_block
+# CHECK: SI_SPILL_S32_TO_VGPR
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers
+# CHECK-NEXT: SI_SPILL_S32_TO_VGPR
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers
+# CHECK-NEXT: SI_SPILL_S32_TO_VGPR
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers
+# CHECK-NEXT: SI_SPILL_S32_TO_VGPR
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers
+
+# CHECK: SI_RESTORE_S32_FROM_VGPR
+# CHECK-NEXT: SI_RESTORE_S32_FROM_VGPR
+# CHECK-NEXT: SI_RESTORE_S32_FROM_VGPR
+# CHECK-NEXT: SI_RESTORE_S32_FROM_VGPR
---
name: empty_entry_block
body: |
- ; CHECK-LABEL: name: empty_entry_block
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr34, $sgpr35, $sgpr36, $sgpr37, $vgpr63
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr34, 0, $vgpr63
- ; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr35, 1, $vgpr63
- ; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr36, 2, $vgpr63
- ; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr37, 3, $vgpr63
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: liveins: $vgpr63
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $sgpr36_sgpr37 = S_XOR_B64 renamable $vcc, -1, implicit-def dead $scc
- ; CHECK-NEXT: $sgpr34_sgpr35 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
- ; CHECK-NEXT: $sgpr37 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 3
- ; CHECK-NEXT: $sgpr36 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 2
- ; CHECK-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 1
- ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0
- ; CHECK-NEXT: S_ENDPGM 0
bb.0:
bb.1:
renamable $sgpr36_sgpr37 = S_XOR_B64 renamable $vcc, -1, implicit-def dead $scc
$sgpr34_sgpr35 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
S_ENDPGM 0
-...
----
-name: lane-vgpr-spill-ncd
-tracksRegLiveness: true
-
-stack:
- - { id: 0, type: spill-slot, size: 8, stack-id: sgpr-spill }
-machineFunctionInfo:
- hasSpilledSGPRs: true
- stackPtrOffsetReg: '$sgpr32'
-
-body: |
- ; CHECK-LABEL: name: lane-vgpr-spill-ncd
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.2(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_BRANCH %bb.2
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR %0, 0, implicit-def $sgpr6_sgpr7
- ; CHECK-NEXT: $sgpr7 = SI_RESTORE_S32_FROM_VGPR %0, 1
- ; CHECK-NEXT: $sgpr4 = S_MOV_B32 32
- ; CHECK-NEXT: $sgpr4_sgpr5 = S_LSHR_B64 $sgpr6_sgpr7, killed $sgpr4, implicit-def dead $scc
- ; CHECK-NEXT: $sgpr4 = COPY $sgpr4, implicit killed $sgpr4_sgpr5
- ; CHECK-NEXT: $sgpr5 = COPY $sgpr6, implicit killed $sgpr6_sgpr7
- ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr5
- ; CHECK-NEXT: $vgpr1 = COPY killed $sgpr4
- ; CHECK-NEXT: SI_RETURN implicit killed $vgpr0, implicit killed $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 0
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 1, [[DEF]], implicit killed $sgpr4_sgpr5
- ; CHECK-NEXT: S_BRANCH %bb.1
- bb.0:
- S_BRANCH %bb.2
-
- bb.1:
- $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0)
- $sgpr4 = S_MOV_B32 32
- $sgpr4_sgpr5 = S_LSHR_B64 $sgpr6_sgpr7, killed $sgpr4, implicit-def dead $scc
- $sgpr4 = COPY $sgpr4, implicit killed $sgpr4_sgpr5
- $sgpr5 = COPY $sgpr6, implicit killed $sgpr6_sgpr7
- $vgpr0 = COPY killed $sgpr5
- $vgpr1 = COPY killed $sgpr4
- SI_RETURN implicit killed $vgpr0, implicit killed $vgpr1
-
- bb.2:
- $sgpr4_sgpr5 = S_MOV_B64 0
- SI_SPILL_S64_SAVE killed $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0)
- S_BRANCH %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index d9d2a99c3e02d..862ed39078fea 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -231,14 +231,14 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, pt
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33
; GCN-NEXT: v_writelane_b32 v40, s4, 2
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -382,14 +382,15 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s4, 2
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
@@ -422,7 +423,6 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
; GCN-NEXT: v_mov_b32_e32 v28, 0
; GCN-NEXT: v_mov_b32_e32 v29, 0
; GCN-NEXT: v_mov_b32_e32 v30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -450,16 +450,16 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i3
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v42, s4, 2
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: v_writelane_b32 v42, s30, 0
+; GCN-NEXT: v_writelane_b32 v42, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: v_writelane_b32 v42, s30, 0
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_writelane_b32 v42, s31, 1
; GCN-NEXT: v_mov_b32_e32 v40, v1
; GCN-NEXT: v_mov_b32_e32 v41, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -603,6 +603,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; FIJI-NEXT: s_mov_b64 exec, s[18:19]
; FIJI-NEXT: v_writelane_b32 v40, s16, 18
+; FIJI-NEXT: s_addk_i32 s32, 0x400
; FIJI-NEXT: v_writelane_b32 v40, s34, 0
; FIJI-NEXT: v_writelane_b32 v40, s35, 1
; FIJI-NEXT: v_writelane_b32 v40, s36, 2
@@ -620,6 +621,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; FIJI-NEXT: v_writelane_b32 v40, s64, 14
; FIJI-NEXT: v_writelane_b32 v40, s65, 15
; FIJI-NEXT: v_writelane_b32 v40, s30, 16
+; FIJI-NEXT: v_writelane_b32 v40, s31, 17
; FIJI-NEXT: s_mov_b32 s50, s15
; FIJI-NEXT: s_mov_b32 s51, s14
; FIJI-NEXT: s_mov_b32 s52, s13
@@ -630,8 +632,6 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; FIJI-NEXT: s_mov_b64 s[48:49], s[4:5]
; FIJI-NEXT: v_add_u32_e32 v3, vcc, v3, v4
; FIJI-NEXT: s_mov_b64 s[54:55], exec
-; FIJI-NEXT: s_addk_i32 s32, 0x400
-; FIJI-NEXT: v_writelane_b32 v40, s31, 17
; FIJI-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; FIJI-NEXT: v_readfirstlane_b32 s16, v0
; FIJI-NEXT: v_readfirstlane_b32 s17, v1
@@ -694,6 +694,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HAWAII-NEXT: s_mov_b64 exec, s[18:19]
; HAWAII-NEXT: v_writelane_b32 v40, s16, 18
+; HAWAII-NEXT: s_addk_i32 s32, 0x400
; HAWAII-NEXT: v_writelane_b32 v40, s34, 0
; HAWAII-NEXT: v_writelane_b32 v40, s35, 1
; HAWAII-NEXT: v_writelane_b32 v40, s36, 2
@@ -711,6 +712,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; HAWAII-NEXT: v_writelane_b32 v40, s64, 14
; HAWAII-NEXT: v_writelane_b32 v40, s65, 15
; HAWAII-NEXT: v_writelane_b32 v40, s30, 16
+; HAWAII-NEXT: v_writelane_b32 v40, s31, 17
; HAWAII-NEXT: s_mov_b32 s50, s15
; HAWAII-NEXT: s_mov_b32 s51, s14
; HAWAII-NEXT: s_mov_b32 s52, s13
@@ -721,8 +723,6 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; HAWAII-NEXT: s_mov_b64 s[48:49], s[4:5]
; HAWAII-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; HAWAII-NEXT: s_mov_b64 s[54:55], exec
-; HAWAII-NEXT: s_addk_i32 s32, 0x400
-; HAWAII-NEXT: v_writelane_b32 v40, s31, 17
; HAWAII-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; HAWAII-NEXT: v_readfirstlane_b32 s16, v0
; HAWAII-NEXT: v_readfirstlane_b32 s17, v1
@@ -785,6 +785,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s16, 18
+; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s34, 0
; GFX9-NEXT: v_writelane_b32 v40, s35, 1
; GFX9-NEXT: v_writelane_b32 v40, s36, 2
@@ -802,6 +803,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; GFX9-NEXT: v_writelane_b32 v40, s64, 14
; GFX9-NEXT: v_writelane_b32 v40, s65, 15
; GFX9-NEXT: v_writelane_b32 v40, s30, 16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 17
; GFX9-NEXT: s_mov_b32 s50, s15
; GFX9-NEXT: s_mov_b32 s51, s14
; GFX9-NEXT: s_mov_b32 s52, s13
@@ -812,8 +814,6 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; GFX9-NEXT: s_mov_b64 s[48:49], s[4:5]
; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
; GFX9-NEXT: s_mov_b64 s[54:55], exec
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 17
; GFX9-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
; GFX9-NEXT: v_readfirstlane_b32 s17, v1
diff --git a/llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir
index 24c631ce5e15f..7b3402494f39f 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir
@@ -16,10 +16,15 @@ body: |
; CHECK: liveins: $sgpr50, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $vgpr63, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr50, 0, $vgpr63
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr50, $vgpr63, 0, 32
; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr52, 1, $vgpr63
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr52, $vgpr63, 1, 32
; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr53, 2, $vgpr63
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr53, $vgpr63, 2, 32
; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr54, 3, $vgpr63
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr54, $vgpr63, 3, 32
; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr55, 4, $vgpr63
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr55, $vgpr63, 4, 32
; CHECK-NEXT: S_NOP 0, implicit $sgpr50
; CHECK-NEXT: $sgpr50 = S_MOV_B32 0
; CHECK-NEXT: S_NOP 0, implicit $sgpr52
diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir
index 85a615c3d8ae8..866ce8a0c0293 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir
@@ -13,6 +13,7 @@ body: |
; CHECK: liveins: $sgpr50, $vgpr63
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr50, 0, $vgpr63
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr50, $vgpr63, 0, 32
; CHECK-NEXT: S_NOP 0, implicit $sgpr50
; CHECK-NEXT: $sgpr50 = S_MOV_B32 0
S_NOP 0, implicit $sgpr50
diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
index bb47603647733..55b5f026c1a4a 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
@@ -56,21 +56,37 @@ body: |
; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr64, 0, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr64, $vgpr63, 0, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr65, 1, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr65, $vgpr63, 1, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr66, 2, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr66, $vgpr63, 2, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr67, 3, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr67, $vgpr63, 3, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr68, 4, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr68, $vgpr63, 4, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr69, 5, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr69, $vgpr63, 5, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr70, 6, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr70, $vgpr63, 6, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr71, 7, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr71, $vgpr63, 7, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr80, 8, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr80, $vgpr63, 8, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr81, 9, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr81, $vgpr63, 9, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr82, 10, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr82, $vgpr63, 10, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr83, 11, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr83, $vgpr63, 11, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr84, 12, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr84, $vgpr63, 12, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr85, 13, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr85, $vgpr63, 13, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr86, 14, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr86, $vgpr63, 14, 32
; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr87, 15, $vgpr63
+ ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr87, $vgpr63, 15, 32
; GCN-NEXT: S_NOP 0
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll
index 1801ea0ba5c8e..d22bc573cc7ca 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll
@@ -2,7 +2,7 @@
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+block-vgpr-csr < %s | FileCheck -check-prefixes=CHECK,GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+block-vgpr-csr < %s | FileCheck -check-prefixes=CHECK,DAGISEL %s
-define i32 @non_entry_func(i32 %x) {
+define i32 @non_entry_func(i32 %x) #0 {
; CHECK-LABEL: non_entry_func:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -14,26 +14,26 @@ define i32 @non_entry_func(i32 %x) {
; CHECK-NEXT: scratch_store_b32 off, v2, s32 offset:100 ; 4-byte Folded Spill
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-NEXT: s_mov_b32 exec_lo, s0
-; CHECK-NEXT: v_writelane_b32 v2, s48, 0
; CHECK-NEXT: s_mov_b32 m0, 0x110003
-; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: ; transferring at most v40 v41 v56 v60 ; 128-byte Folded Spill
; CHECK-NEXT: scratch_store_block off, v[40:71], s32 offset:4
; CHECK-NEXT: s_mov_b32 m0, 1
-; CHECK-NEXT: v_writelane_b32 v2, s49, 1
; CHECK-NEXT: ; transferring at most v120 ; 128-byte Folded Spill
; CHECK-NEXT: scratch_store_block off, v[120:151], s32
+; CHECK-NEXT: v_writelane_b32 v2, s48, 0
+; CHECK-NEXT: v_writelane_b32 v2, s49, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: s_nop
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; transferring at most v120 ; 128-byte Folded Reload
; CHECK-NEXT: scratch_load_block v[120:151], off, s32
; CHECK-NEXT: s_mov_b32 m0, 0x110003
-; CHECK-NEXT: scratch_store_b32 off, v1, s32 offset:88
+; CHECK-NEXT: v_readlane_b32 s49, v2, 1
; CHECK-NEXT: ; transferring at most v40 v41 v56 v60 ; 128-byte Folded Reload
; CHECK-NEXT: scratch_load_block v[40:71], off, s32 offset:4
+; CHECK-NEXT: scratch_store_b32 off, v1, s32 offset:88
; CHECK-NEXT: v_mov_b32_e32 v0, v1
-; CHECK-NEXT: v_readlane_b32 s49, v2, 1
; CHECK-NEXT: v_readlane_b32 s48, v2, 0
; CHECK-NEXT: s_xor_saveexec_b32 s0, -1
; CHECK-NEXT: scratch_load_b32 v2, off, s32 offset:100 ; 4-byte Folded Reload
@@ -47,7 +47,7 @@ define i32 @non_entry_func(i32 %x) {
ret i32 %x
}
-define amdgpu_kernel void @entry_func(i32 %x) {
+define amdgpu_kernel void @entry_func(i32 %x) #0 {
; GISEL-LABEL: entry_func:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -93,3 +93,5 @@ define amdgpu_kernel void @entry_func(i32 %x) {
%res = call i32 @non_entry_func(i32 %x)
ret void
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
index e962d1bad9779..51c28688f76ee 100644
--- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -enable-var-scope %s
-define void @spill_more_than_wavesize_csr_sgprs() {
+define void @spill_more_than_wavesize_csr_sgprs() #0 {
; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -100,7 +100,7 @@ define void @spill_more_than_wavesize_csr_sgprs() {
ret void
}
-define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() {
+define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() #0 {
; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs_with_stack_object:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -142,8 +142,8 @@ define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() {
; CHECK-NEXT: v_writelane_b32 v1, s99, 32
; CHECK-NEXT: v_writelane_b32 v1, s100, 33
; CHECK-NEXT: v_writelane_b32 v1, s101, 34
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_writelane_b32 v1, s102, 35
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
@@ -203,3 +203,5 @@ define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() {
,~{s99},~{s100},~{s101},~{s102}"()
ret void
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 540737672ed15..c975f3a9ba946 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -292,19 +292,19 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: v_writelane_b32 v40, s34, 3
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_add_i32 s32, s32, 0x30000
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s34 offset:4
-; GCN-NEXT: s_add_i32 s32, s32, 0x30000
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GCN-NEXT: s_waitcnt vmcnt(1)
@@ -453,7 +453,7 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 {
; GCN-NEXT: v_writelane_b32 v39, s4, 32
; GCN-NEXT: v_writelane_b32 v39, s34, 33
; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4
+; GCN-NEXT: s_addk_i32 s32, 0x6000
; GCN-NEXT: v_writelane_b32 v39, s39, 0
; GCN-NEXT: v_writelane_b32 v39, s48, 1
; GCN-NEXT: v_writelane_b32 v39, s49, 2
@@ -485,8 +485,8 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 {
; GCN-NEXT: v_writelane_b32 v39, s99, 28
; GCN-NEXT: v_writelane_b32 v39, s100, 29
; GCN-NEXT: v_writelane_b32 v39, s101, 30
-; GCN-NEXT: s_addk_i32 s32, 0x6000
; GCN-NEXT: v_writelane_b32 v39, s102, 31
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4
; GCN-NEXT: s_mov_b32 s32, s34
; GCN-NEXT: v_readlane_b32 s34, v39, 33
; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -576,7 +576,7 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
; GCN-NEXT: v_writelane_b32 v39, s4, 32
; GCN-NEXT: v_writelane_b32 v39, s34, 33
; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4
+; GCN-NEXT: s_add_i32 s32, s32, 0x46000
; GCN-NEXT: v_writelane_b32 v39, s39, 0
; GCN-NEXT: v_writelane_b32 v39, s48, 1
; GCN-NEXT: v_writelane_b32 v39, s49, 2
@@ -608,9 +608,9 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
; GCN-NEXT: v_writelane_b32 v39, s99, 28
; GCN-NEXT: v_writelane_b32 v39, s100, 29
; GCN-NEXT: v_writelane_b32 v39, s101, 30
-; GCN-NEXT: v_mov_b32_e32 v1, 0x1080
-; GCN-NEXT: s_add_i32 s32, s32, 0x46000
; GCN-NEXT: v_writelane_b32 v39, s102, 31
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4
+; GCN-NEXT: v_mov_b32_e32 v1, 0x1080
; GCN-NEXT: s_mov_b32 s32, s34
; GCN-NEXT: v_readlane_b32 s34, v39, 33
; GCN-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index 3537dae64ffea..92fb6a3e0f03b 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -1314,13 +1314,13 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() #0 {
; WAVE32-OPT-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s16
; WAVE32-OPT-NEXT: v_writelane_b32 v32, s30, 0
+; WAVE32-OPT-NEXT: s_addk_i32 s32, 0x1200
+; WAVE32-OPT-NEXT: v_writelane_b32 v32, s31, 1
; WAVE32-OPT-NEXT: v_mov_b32_e32 v0, 42
; WAVE32-OPT-NEXT: v_mov_b32_e32 v1, 17
-; WAVE32-OPT-NEXT: s_addk_i32 s32, 0x1200
-; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi
; WAVE32-OPT-NEXT: s_mov_b32 s18, s32
+; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi
; WAVE32-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
-; WAVE32-OPT-NEXT: v_writelane_b32 v32, s31, 1
; WAVE32-OPT-NEXT: s_lshr_b32 s19, s18, 5
; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[0:3], s33
; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1349,13 +1349,13 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() #0 {
; WAVE64-OPT-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
; WAVE64-OPT-NEXT: s_mov_b64 exec, s[16:17]
; WAVE64-OPT-NEXT: v_writelane_b32 v32, s30, 0
+; WAVE64-OPT-NEXT: s_addk_i32 s32, 0x2400
+; WAVE64-OPT-NEXT: v_writelane_b32 v32, s31, 1
; WAVE64-OPT-NEXT: v_mov_b32_e32 v0, 42
; WAVE64-OPT-NEXT: v_mov_b32_e32 v1, 17
-; WAVE64-OPT-NEXT: s_addk_i32 s32, 0x2400
-; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi
; WAVE64-OPT-NEXT: s_mov_b32 s18, s32
+; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi
; WAVE64-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
-; WAVE64-OPT-NEXT: v_writelane_b32 v32, s31, 1
; WAVE64-OPT-NEXT: s_lshr_b32 s19, s18, 6
; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[0:3], s33
; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
index 05ea168c9ec7c..7d6121d464a7d 100644
--- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
@@ -167,13 +167,13 @@ define void @outgoing_f16_arg(ptr %ptr) #0 {
; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
-; GFX7-NEXT: flat_load_ushort v0, v[0:1]
; GFX7-NEXT: v_writelane_b32 v40, s16, 2
; GFX7-NEXT: v_writelane_b32 v40, s30, 0
-; GFX7-NEXT: s_mov_b32 s17, f16_user@abs32@hi
-; GFX7-NEXT: s_mov_b32 s16, f16_user@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
+; GFX7-NEXT: flat_load_ushort v0, v[0:1]
+; GFX7-NEXT: s_mov_b32 s17, f16_user@abs32@hi
+; GFX7-NEXT: s_mov_b32 s16, f16_user@abs32@lo
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
; GFX7-NEXT: v_readlane_b32 s31, v40, 1
@@ -199,13 +199,13 @@ define void @outgoing_v2f16_arg(ptr %ptr) #0 {
; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
-; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_writelane_b32 v40, s16, 2
; GFX7-NEXT: v_writelane_b32 v40, s30, 0
-; GFX7-NEXT: s_mov_b32 s17, v2f16_user@abs32@hi
-; GFX7-NEXT: s_mov_b32 s16, v2f16_user@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
+; GFX7-NEXT: flat_load_dword v0, v[0:1]
+; GFX7-NEXT: s_mov_b32 s17, v2f16_user@abs32@hi
+; GFX7-NEXT: s_mov_b32 s16, v2f16_user@abs32@lo
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
; GFX7-NEXT: v_readlane_b32 s31, v40, 1
@@ -232,13 +232,13 @@ define void @outgoing_f16_return(ptr %ptr) #0 {
; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: v_writelane_b32 v42, s16, 2
-; GFX7-NEXT: v_writelane_b32 v42, s30, 0
-; GFX7-NEXT: s_mov_b32 s17, f16_result@abs32@hi
-; GFX7-NEXT: s_mov_b32 s16, f16_result@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT: v_writelane_b32 v42, s30, 0
; GFX7-NEXT: v_writelane_b32 v42, s31, 1
+; GFX7-NEXT: s_mov_b32 s17, f16_result@abs32@hi
+; GFX7-NEXT: s_mov_b32 s16, f16_result@abs32@lo
; GFX7-NEXT: v_mov_b32_e32 v41, v1
; GFX7-NEXT: v_mov_b32_e32 v40, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -270,13 +270,13 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 {
; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: v_writelane_b32 v42, s16, 2
-; GFX7-NEXT: v_writelane_b32 v42, s30, 0
-; GFX7-NEXT: s_mov_b32 s17, v2f16_result@abs32@hi
-; GFX7-NEXT: s_mov_b32 s16, v2f16_result@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT: v_writelane_b32 v42, s30, 0
; GFX7-NEXT: v_writelane_b32 v42, s31, 1
+; GFX7-NEXT: s_mov_b32 s17, v2f16_result@abs32@hi
+; GFX7-NEXT: s_mov_b32 s16, v2f16_result@abs32@lo
; GFX7-NEXT: v_mov_b32_e32 v41, v1
; GFX7-NEXT: v_mov_b32_e32 v40, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -308,13 +308,13 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: v_writelane_b32 v42, s16, 2
-; GFX7-NEXT: v_writelane_b32 v42, s30, 0
-; GFX7-NEXT: s_mov_b32 s17, v4f16_result@abs32@hi
-; GFX7-NEXT: s_mov_b32 s16, v4f16_result@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT: v_writelane_b32 v42, s30, 0
; GFX7-NEXT: v_writelane_b32 v42, s31, 1
+; GFX7-NEXT: s_mov_b32 s17, v4f16_result@abs32@hi
+; GFX7-NEXT: s_mov_b32 s16, v4f16_result@abs32@lo
; GFX7-NEXT: v_mov_b32_e32 v41, v1
; GFX7-NEXT: v_mov_b32_e32 v40, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -349,13 +349,13 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 {
; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: v_writelane_b32 v42, s16, 2
-; GFX7-NEXT: v_writelane_b32 v42, s30, 0
-; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi
-; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT: v_writelane_b32 v42, s30, 0
; GFX7-NEXT: v_writelane_b32 v42, s31, 1
+; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi
+; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo
; GFX7-NEXT: v_mov_b32_e32 v41, v1
; GFX7-NEXT: v_mov_b32_e32 v40, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -397,10 +397,10 @@ define half @call_split_type_used_outside_block_v8f16() #0 {
; GFX7-NEXT: s_mov_b64 exec, s[18:19]
; GFX7-NEXT: v_writelane_b32 v40, s16, 2
; GFX7-NEXT: v_writelane_b32 v40, s30, 0
-; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi
-; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
+; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi
+; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
; GFX7-NEXT: v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
index 13cde61ff16a0..20443fd5574e7 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
@@ -10,16 +10,16 @@ define void @test_load_zext() #0 {
; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1
; CHECK-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[2:3]
-; CHECK-NEXT: s_add_i32 s32, s32, 16
; CHECK-NEXT: v_writelane_b32 v40, s0, 2
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: s_add_i32 s32, s32, 16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_getpc_b64 s[0:1]
; CHECK-NEXT: s_add_u32 s0, s0, has_spgr_args@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s1, s1, has_spgr_args@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: s_mov_b32 s0, DescriptorBuffer@abs32@lo
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[2:3]
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.waterfall.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.waterfall.ll
index 8cd83979d26c0..1a317cf34b647 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.waterfall.ll
@@ -19,9 +19,9 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v1, s30, 0
-; CHECK-NEXT: s_mov_b64 s[18:19], exec
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v1, s31, 1
+; CHECK-NEXT: s_mov_b64 s[18:19], exec
; CHECK-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_readfirstlane_b32 s16, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s16, v0
@@ -61,6 +61,7 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 20
+; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s34, 0
; CHECK-NEXT: v_writelane_b32 v40, s35, 1
; CHECK-NEXT: v_writelane_b32 v40, s36, 2
@@ -74,18 +75,18 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
; CHECK-NEXT: v_writelane_b32 v40, s52, 10
; CHECK-NEXT: v_writelane_b32 v40, s53, 11
; CHECK-NEXT: v_writelane_b32 v40, s54, 12
-; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s55, 13
; CHECK-NEXT: v_writelane_b32 v40, s64, 14
+; CHECK-NEXT: v_writelane_b32 v40, s65, 15
+; CHECK-NEXT: v_writelane_b32 v40, s66, 16
+; CHECK-NEXT: v_writelane_b32 v40, s67, 17
+; CHECK-NEXT: v_writelane_b32 v40, s30, 18
+; CHECK-NEXT: v_writelane_b32 v40, s31, 19
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, constant@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, constant@rel32@hi+12
-; CHECK-NEXT: v_writelane_b32 v40, s65, 15
; CHECK-NEXT: s_load_dwordx2 s[64:65], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v40, s66, 16
-; CHECK-NEXT: v_writelane_b32 v40, s67, 17
-; CHECK-NEXT: v_writelane_b32 v40, s30, 18
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
; CHECK-NEXT: s_mov_b32 s52, s13
@@ -94,7 +95,6 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[54:55], exec
-; CHECK-NEXT: v_writelane_b32 v40, s31, 19
; CHECK-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_readfirstlane_b32 s16, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s16, v0
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index c4f381c27eef8..24b4b79101ea0 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -13,14 +13,14 @@ define internal fastcc void @widget() #0 {
; GFX90A-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[18:19]
-; GFX90A-NEXT: s_addk_i32 s32, 0x400
; GFX90A-NEXT: v_writelane_b32 v40, s16, 2
+; GFX90A-NEXT: v_writelane_b32 v40, s30, 0
+; GFX90A-NEXT: s_addk_i32 s32, 0x400
+; GFX90A-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-NEXT: s_getpc_b64 s[16:17]
; GFX90A-NEXT: s_add_u32 s16, s16, wobble@gotpcrel32@lo+4
; GFX90A-NEXT: s_addc_u32 s17, s17, wobble@gotpcrel32@hi+12
; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX90A-NEXT: v_writelane_b32 v40, s30, 0
-; GFX90A-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_swappc_b64 s[30:31], s[16:17]
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index e5215fe1acdef..0c87ed574a1e6 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -646,25 +646,23 @@ define i32 @s_in_multiuse_A(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
; GCN-NEXT: s_or_saveexec_b32 s16, -1
; GCN-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b32 exec_lo, s16
+; GCN-NEXT: v_writelane_b32 v40, s2, 4
; GCN-NEXT: s_add_i32 s32, s32, 16
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s30, 2
+; GCN-NEXT: v_writelane_b32 v40, s31, 3
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, use32@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, use32@gotpcrel32@hi+12
-; GCN-NEXT: v_writelane_b32 v40, s2, 4
-; GCN-NEXT: s_load_b64 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: v_writelane_b32 v40, s34, 0
-; GCN-NEXT: s_mov_b32 s34, s1
-; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: s_load_b64 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_and_b32 s35, s0, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s34, s1
; GCN-NEXT: v_mov_b32_e32 v0, s35
-; GCN-NEXT: v_writelane_b32 v40, s30, 2
-; GCN-NEXT: v_writelane_b32 v40, s31, 3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_xor_b32 s0, s35, s34
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_readlane_b32 s30, v40, 2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_readlane_b32 s31, v40, 3
@@ -694,25 +692,23 @@ define i32 @s_in_multiuse_B(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
; GCN-NEXT: s_or_saveexec_b32 s16, -1
; GCN-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b32 exec_lo, s16
+; GCN-NEXT: v_writelane_b32 v40, s2, 4
; GCN-NEXT: s_add_i32 s32, s32, 16
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s30, 2
+; GCN-NEXT: v_writelane_b32 v40, s31, 3
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, use32@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, use32@gotpcrel32@hi+12
-; GCN-NEXT: v_writelane_b32 v40, s2, 4
-; GCN-NEXT: s_load_b64 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_xor_b32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_load_b64 s[16:17], s[16:17], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_writelane_b32 v40, s34, 0
; GCN-NEXT: s_mov_b32 s34, s1
-; GCN-NEXT: v_writelane_b32 v40, s35, 1
; GCN-NEXT: s_and_b32 s35, s0, s3
-; GCN-NEXT: v_writelane_b32 v40, s30, 2
-; GCN-NEXT: v_writelane_b32 v40, s31, 3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_xor_b32 s0, s35, s34
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_readlane_b32 s30, v40, 2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_readlane_b32 s31, v40, 3
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
index a81d9a458e23a..a82453ee23ee9 100644
--- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
@@ -8,10 +8,6 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
; CHECK-LABEL: eliminate_spill_after_mfma_rewrite:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v5
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v4
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v3
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
@@ -60,6 +56,11 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
; CHECK-NEXT: buffer_store_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a63, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v5
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v4
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v3
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
+; CHECK-NEXT: s_nop 1
; CHECK-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[32:63], v[0:31]
@@ -212,10 +213,6 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
; CHECK-LABEL: eliminate_spill_after_mfma_rewrite_x2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v5
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v4
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v3
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
@@ -264,6 +261,11 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
; CHECK-NEXT: buffer_store_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a63, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v5
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v4
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v3
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
+; CHECK-NEXT: s_nop 1
; CHECK-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[32:63], v[0:31]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
index 13457b1e2f254..dfef942759c09 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
@@ -3745,13 +3745,23 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX9-GISEL-LABEL: test_vector_reduce_mul_v16i64:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32
+; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
+; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse
+; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[0:1], v0, v17, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[48:49], s[0:1], v2, v19, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[52:53], s[0:1], v4, v21, 0
@@ -3759,14 +3769,6 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[42:43], s[0:1], v8, v25, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[46:47], s[0:1], v10, v27, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[58:59], s[0:1], v12, v29, 0
-; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
-; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
-; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
-; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
-; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
-; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
-; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
-; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v0, v16, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[38:39], s[0:1], v2, v18, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[50:51], s[0:1], v4, v20, 0
@@ -3802,8 +3804,6 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v12, v8, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v12, v18, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v18, v[6:7]
-; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
-; GFX9-GISEL-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[60:61], s[0:1], v14, v30, 0
; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v3, v8
; GFX9-GISEL-NEXT: v_add_u32_e32 v27, v33, v4
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
index 573cdedb523d5..0f67d80e97104 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
@@ -298,45 +298,96 @@ define <8 x half> @baz() nounwind {
; CHECK-NEXT: scratch_store_b32 off, v93, s33 offset:404 ; 4-byte Folded Spill
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-NEXT: s_mov_b32 exec_lo, s1
+; CHECK-NEXT: v_writelane_b32 v93, s0, 14
+; CHECK-NEXT: s_addk_co_i32 s32, 0x1a0
; CHECK-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v40, s33 offset:144
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v41, s33 offset:140
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v42, s33 offset:136
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v43, s33 offset:132
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v44, s33 offset:128
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v45, s33 offset:124
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v46, s33 offset:120
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v47, s33 offset:116
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v56, s33 offset:112
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v57, s33 offset:108
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v58, s33 offset:104
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v59, s33 offset:100
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v60, s33 offset:96
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v61, s33 offset:92
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v62, s33 offset:88
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v63, s33 offset:84
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v72, s33 offset:80
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v73, s33 offset:76
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v74, s33 offset:72
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v75, s33 offset:68
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v76, s33 offset:64
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v77, s33 offset:60
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v78, s33 offset:56
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v79, s33 offset:52
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v88, s33 offset:48
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v89, s33 offset:44
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v90, s33 offset:40
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v91, s33 offset:36
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v92, s33 offset:32
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v104, s33 offset:28
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v105, s33 offset:24
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v106, s33 offset:20
; CHECK-NEXT: s_clause 0x4 ; 20-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v107, s33 offset:16
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v108, s33 offset:12
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v109, s33 offset:8
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v110, s33 offset:4
+; CHECK-NEXT: ; meta instruction
; CHECK-NEXT: scratch_store_b32 off, v111, s33
+; CHECK-NEXT: v_writelane_b32 v93, s34, 0
+; CHECK-NEXT: v_writelane_b32 v93, s35, 1
+; CHECK-NEXT: v_writelane_b32 v93, s36, 2
+; CHECK-NEXT: v_writelane_b32 v93, s37, 3
+; CHECK-NEXT: v_writelane_b32 v93, s38, 4
+; CHECK-NEXT: v_writelane_b32 v93, s39, 5
+; CHECK-NEXT: v_writelane_b32 v93, s48, 6
+; CHECK-NEXT: v_writelane_b32 v93, s49, 7
+; CHECK-NEXT: v_writelane_b32 v93, s50, 8
+; CHECK-NEXT: v_writelane_b32 v93, s51, 9
+; CHECK-NEXT: v_writelane_b32 v93, s52, 10
+; CHECK-NEXT: v_writelane_b32 v93, s53, 11
+; CHECK-NEXT: v_writelane_b32 v93, s30, 12
+; CHECK-NEXT: v_writelane_b32 v93, s31, 13
; CHECK-NEXT: v_dual_mov_b32 v92, v31 :: v_dual_mov_b32 v1, 0
; CHECK-NEXT: v_dual_mov_b32 v0, 0x60 :: v_dual_mov_b32 v3, 0
; CHECK-NEXT: v_dual_mov_b32 v2, 0x50 :: v_dual_mov_b32 v5, 0
@@ -363,14 +414,21 @@ define <8 x half> @baz() nounwind {
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_b128 v[76:79], v[4:5], off
; CHECK-NEXT: global_load_b128 v[88:91], v[6:7], off
-; CHECK-NEXT: v_writelane_b32 v93, s0, 14
-; CHECK-NEXT: s_addk_co_i32 s32, 0x1a0
; CHECK-NEXT: s_getpc_b64 s[0:1]
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-NEXT: s_sext_i32_i16 s1, s1
; CHECK-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24
+; CHECK-NEXT: s_mov_b32 s50, s15
+; CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; CHECK-NEXT: s_mov_b32 s51, s14
+; CHECK-NEXT: s_mov_b32 s52, s13
+; CHECK-NEXT: s_mov_b32 s53, s12
+; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_wait_loadcnt 0x5
; CHECK-NEXT: scratch_store_b128 off, v[10:13], s33 offset:148 ; 16-byte Folded Spill
; CHECK-NEXT: s_wait_loadcnt 0x4
@@ -390,8 +448,6 @@ define <8 x half> @baz() nounwind {
; CHECK-NEXT: global_load_b128 v[18:21], v[8:9], off offset:80
; CHECK-NEXT: global_load_b128 v[22:25], v[8:9], off offset:96
; CHECK-NEXT: global_load_b128 v[26:29], v[8:9], off offset:112
-; CHECK-NEXT: v_writelane_b32 v93, s34, 0
-; CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; CHECK-NEXT: s_wait_loadcnt 0x4
; CHECK-NEXT: scratch_store_b128 off, v[10:13], s33 offset:276 ; 16-byte Folded Spill
; CHECK-NEXT: s_wait_loadcnt 0x3
@@ -406,27 +462,6 @@ define <8 x half> @baz() nounwind {
; CHECK-NEXT: scratch_store_b128 off, v[30:33], s33 offset:356
; CHECK-NEXT: scratch_store_b128 off, v[34:37], s33 offset:372
; CHECK-NEXT: scratch_store_b128 off, v[38:41], s33 offset:388
-; CHECK-NEXT: v_writelane_b32 v93, s35, 1
-; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT: v_writelane_b32 v93, s36, 2
-; CHECK-NEXT: v_writelane_b32 v93, s37, 3
-; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT: v_writelane_b32 v93, s38, 4
-; CHECK-NEXT: v_writelane_b32 v93, s39, 5
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT: v_writelane_b32 v93, s48, 6
-; CHECK-NEXT: v_writelane_b32 v93, s49, 7
-; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
-; CHECK-NEXT: v_writelane_b32 v93, s50, 8
-; CHECK-NEXT: s_mov_b32 s50, s15
-; CHECK-NEXT: v_writelane_b32 v93, s51, 9
-; CHECK-NEXT: s_mov_b32 s51, s14
-; CHECK-NEXT: v_writelane_b32 v93, s52, 10
-; CHECK-NEXT: s_mov_b32 s52, s13
-; CHECK-NEXT: v_writelane_b32 v93, s53, 11
-; CHECK-NEXT: s_mov_b32 s53, s12
-; CHECK-NEXT: v_writelane_b32 v93, s30, 12
-; CHECK-NEXT: v_writelane_b32 v93, s31, 13
; CHECK-NEXT: s_wait_kmcnt 0x0
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 580ef1522ee14..1ac385d123204 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -16,15 +16,19 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: v_writelane_b32 v44, s4, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x800
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v44, s30, 0
+; GFX9-NEXT: v_writelane_b32 v44, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v36, v16
; GFX9-NEXT: v_mov_b32_e32 v35, v15
; GFX9-NEXT: v_mov_b32_e32 v34, v14
; GFX9-NEXT: v_mov_b32_e32 v33, v13
; GFX9-NEXT: v_mov_b32_e32 v32, v12
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: ;;#ASMSTART
@@ -34,14 +38,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1
-; GFX9-NEXT: s_addk_i32 s32, 0x800
-; GFX9-NEXT: v_writelane_b32 v44, s4, 2
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: v_writelane_b32 v44, s30, 0
-; GFX9-NEXT: v_writelane_b32 v44, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v40
@@ -72,15 +72,19 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s5
+; GFX10-NEXT: v_writelane_b32 v44, s4, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x400
+; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v44, s30, 0
+; GFX10-NEXT: v_writelane_b32 v44, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v36, v16
; GFX10-NEXT: v_mov_b32_e32 v35, v15
; GFX10-NEXT: v_mov_b32_e32 v34, v14
; GFX10-NEXT: v_mov_b32_e32 v33, v13
; GFX10-NEXT: v_mov_b32_e32 v32, v12
-; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
@@ -90,14 +94,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX10-NEXT: s_addk_i32 s32, 0x400
-; GFX10-NEXT: v_writelane_b32 v44, s4, 2
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_writelane_b32 v44, s30, 0
-; GFX10-NEXT: v_writelane_b32 v44, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v0, v40
@@ -129,14 +130,21 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
-; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
-; GFX11-NEXT: v_mov_b32_e32 v32, v12
+; GFX11-NEXT: v_writelane_b32 v44, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:12
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:8
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:4
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v43, s33
+; GFX11-NEXT: v_writelane_b32 v44, s30, 0
+; GFX11-NEXT: v_writelane_b32 v44, s31, 1
+; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
+; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
+; GFX11-NEXT: v_mov_b32_e32 v32, v12
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
@@ -146,14 +154,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_add_i32 s32, s32, 32
-; GFX11-NEXT: v_writelane_b32 v44, s0, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_writelane_b32 v44, s30, 0
-; GFX11-NEXT: v_writelane_b32 v44, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_dual_mov_b32 v0, v40 :: v_dual_mov_b32 v1, v41
@@ -206,25 +210,25 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: v_writelane_b32 v45, s4, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: v_writelane_b32 v45, s30, 0
+; GFX9-NEXT: v_writelane_b32 v45, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v44, v16
; GFX9-NEXT: v_mov_b32_e32 v43, v15
; GFX9-NEXT: v_mov_b32_e32 v42, v14
; GFX9-NEXT: v_mov_b32_e32 v41, v13
; GFX9-NEXT: v_mov_b32_e32 v40, v12
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1
-; GFX9-NEXT: s_addk_i32 s32, 0x800
-; GFX9-NEXT: v_writelane_b32 v45, s4, 2
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: v_writelane_b32 v45, s30, 0
-; GFX9-NEXT: v_writelane_b32 v45, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -256,25 +260,26 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mov_b32 exec_lo, s5
+; GFX10-NEXT: v_writelane_b32 v45, s4, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v45, s30, 0
+; GFX10-NEXT: v_writelane_b32 v45, s31, 1
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX10-NEXT: s_addk_i32 s32, 0x400
-; GFX10-NEXT: v_writelane_b32 v45, s4, 2
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v40, v16
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v41, v15
-; GFX10-NEXT: v_writelane_b32 v45, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v42, v14
; GFX10-NEXT: v_mov_b32_e32 v43, v13
; GFX10-NEXT: v_mov_b32_e32 v44, v12
-; GFX10-NEXT: v_writelane_b32 v45, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -306,24 +311,28 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: v_writelane_b32 v45, s0, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: s_clause 0x4 ; 20-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4
+; GFX11-NEXT: ; meta instruction
; GFX11-NEXT: scratch_store_b32 off, v44, s33
+; GFX11-NEXT: v_writelane_b32 v45, s30, 0
+; GFX11-NEXT: v_writelane_b32 v45, s31, 1
; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_add_i32 s32, s32, 32
-; GFX11-NEXT: v_writelane_b32 v45, s0, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v40, v16 :: v_dual_mov_b32 v41, v15
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_writelane_b32 v45, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v42, v14 :: v_dual_mov_b32 v43, v13
; GFX11-NEXT: v_mov_b32_e32 v44, v12
-; GFX11-NEXT: v_writelane_b32 v45, s31, 1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -364,5 +373,5 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immar
attributes #0 = { nounwind writeonly }
attributes #1 = { nounwind readonly }
-attributes #2 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #2 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" nounwind }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 2e29d7f215686..013b402a82488 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -3075,14 +3075,14 @@ define void @callee_no_stack_with_call() #1 {
; GFX1032-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX1032-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX1032-NEXT: s_mov_b32 exec_lo, s17
-; GFX1032-NEXT: s_addk_i32 s32, 0x200
; GFX1032-NEXT: v_writelane_b32 v40, s16, 2
+; GFX1032-NEXT: s_addk_i32 s32, 0x200
+; GFX1032-NEXT: v_writelane_b32 v40, s30, 0
+; GFX1032-NEXT: v_writelane_b32 v40, s31, 1
; GFX1032-NEXT: s_getpc_b64 s[16:17]
; GFX1032-NEXT: s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4
; GFX1032-NEXT: s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12
; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX1032-NEXT: v_writelane_b32 v40, s30, 0
-; GFX1032-NEXT: v_writelane_b32 v40, s31, 1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-NEXT: v_readlane_b32 s30, v40, 0
@@ -3106,14 +3106,14 @@ define void @callee_no_stack_with_call() #1 {
; GFX1064-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX1064-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX1064-NEXT: s_mov_b64 exec, s[18:19]
-; GFX1064-NEXT: s_addk_i32 s32, 0x400
; GFX1064-NEXT: v_writelane_b32 v40, s16, 2
+; GFX1064-NEXT: s_addk_i32 s32, 0x400
+; GFX1064-NEXT: v_writelane_b32 v40, s30, 0
+; GFX1064-NEXT: v_writelane_b32 v40, s31, 1
; GFX1064-NEXT: s_getpc_b64 s[16:17]
; GFX1064-NEXT: s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4
; GFX1064-NEXT: s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12
; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX1064-NEXT: v_writelane_b32 v40, s30, 0
-; GFX1064-NEXT: v_writelane_b32 v40, s31, 1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-NEXT: v_readlane_b32 s30, v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index fa9b09ea73c93..4c7898006212b 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -369,10 +369,10 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) #0 {
; DAGISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16
; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; DAGISEL-NEXT: v_writelane_b32 v2, s20, 0
; DAGISEL-NEXT: ;;#ASMSTART
; DAGISEL-NEXT: ; clobber CSR
; DAGISEL-NEXT: ;;#ASMEND
-; DAGISEL-NEXT: v_writelane_b32 v2, s20, 0
; DAGISEL-NEXT: ;;#ASMSTART
; DAGISEL-NEXT: ; clobber non-CSR
; DAGISEL-NEXT: ;;#ASMEND
@@ -408,10 +408,10 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) #0 {
; GISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16
; GISEL-NEXT: s_mov_b32 exec_lo, -1
; GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; GISEL-NEXT: v_writelane_b32 v2, s20, 0
; GISEL-NEXT: ;;#ASMSTART
; GISEL-NEXT: ; clobber CSR
; GISEL-NEXT: ;;#ASMEND
-; GISEL-NEXT: v_writelane_b32 v2, s20, 0
; GISEL-NEXT: ;;#ASMSTART
; GISEL-NEXT: ; clobber non-CSR
; GISEL-NEXT: ;;#ASMEND
@@ -447,10 +447,10 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) #0 {
; DAGISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16
; DAGISEL64-NEXT: s_mov_b64 exec, -1
; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: v_writelane_b32 v2, s20, 0
; DAGISEL64-NEXT: ;;#ASMSTART
; DAGISEL64-NEXT: ; clobber CSR
; DAGISEL64-NEXT: ;;#ASMEND
-; DAGISEL64-NEXT: v_writelane_b32 v2, s20, 0
; DAGISEL64-NEXT: ;;#ASMSTART
; DAGISEL64-NEXT: ; clobber non-CSR
; DAGISEL64-NEXT: ;;#ASMEND
@@ -487,10 +487,10 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) #0 {
; GISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16
; GISEL64-NEXT: s_mov_b64 exec, -1
; GISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; GISEL64-NEXT: v_writelane_b32 v2, s20, 0
; GISEL64-NEXT: ;;#ASMSTART
; GISEL64-NEXT: ; clobber CSR
; GISEL64-NEXT: ;;#ASMEND
-; GISEL64-NEXT: v_writelane_b32 v2, s20, 0
; GISEL64-NEXT: ;;#ASMSTART
; GISEL64-NEXT: ; clobber non-CSR
; GISEL64-NEXT: ;;#ASMEND
@@ -525,11 +525,11 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) #0 {
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 nv ; 4-byte Folded Spill
+; GFX1250-DAGISEL-NEXT: v_writelane_b32 v2, s20, 0
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-DAGISEL-NEXT: ;;#ASMSTART
; GFX1250-DAGISEL-NEXT: ; clobber CSR
; GFX1250-DAGISEL-NEXT: ;;#ASMEND
-; GFX1250-DAGISEL-NEXT: v_writelane_b32 v2, s20, 0
; GFX1250-DAGISEL-NEXT: ;;#ASMSTART
; GFX1250-DAGISEL-NEXT: ; clobber non-CSR
; GFX1250-DAGISEL-NEXT: ;;#ASMEND
@@ -1582,17 +1582,16 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-NEXT: v_writelane_b32 v40, s0, 3
+; DAGISEL-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL-NEXT: v_writelane_b32 v40, s4, 0
+; DAGISEL-NEXT: v_writelane_b32 v40, s30, 1
+; DAGISEL-NEXT: v_writelane_b32 v40, s31, 2
; DAGISEL-NEXT: v_mov_b32_e32 v2, v0
; DAGISEL-NEXT: v_swap_b32 v0, v1
; DAGISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
-; DAGISEL-NEXT: v_writelane_b32 v40, s4, 0
; DAGISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
-; DAGISEL-NEXT: s_addk_co_i32 s32, 0x250
-; DAGISEL-NEXT: v_writelane_b32 v40, s30, 1
-; DAGISEL-NEXT: v_writelane_b32 v40, s31, 2
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 1
; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
; DAGISEL-NEXT: v_readlane_b32 s4, v40, 0
@@ -1918,17 +1917,16 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: v_writelane_b32 v40, s0, 3
+; GISEL-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL-NEXT: v_writelane_b32 v40, s4, 0
+; GISEL-NEXT: v_writelane_b32 v40, s30, 1
+; GISEL-NEXT: v_writelane_b32 v40, s31, 2
; GISEL-NEXT: v_mov_b32_e32 v2, v0
; GISEL-NEXT: v_swap_b32 v0, v1
; GISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
-; GISEL-NEXT: v_writelane_b32 v40, s4, 0
; GISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
-; GISEL-NEXT: s_addk_co_i32 s32, 0x250
-; GISEL-NEXT: v_writelane_b32 v40, s30, 1
-; GISEL-NEXT: v_writelane_b32 v40, s31, 2
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_readlane_b32 s30, v40, 1
; GISEL-NEXT: v_readlane_b32 s31, v40, 2
; GISEL-NEXT: v_readlane_b32 s4, v40, 0
@@ -2254,18 +2252,17 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; DAGISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; DAGISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL64-NEXT: v_writelane_b32 v40, s0, 4
-; DAGISEL64-NEXT: v_mov_b32_e32 v2, v0
-; DAGISEL64-NEXT: v_swap_b32 v0, v1
-; DAGISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
-; DAGISEL64-NEXT: v_writelane_b32 v40, s4, 0
-; DAGISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL64-NEXT: v_writelane_b32 v40, s4, 0
; DAGISEL64-NEXT: v_writelane_b32 v40, s5, 1
; DAGISEL64-NEXT: v_writelane_b32 v40, s30, 2
; DAGISEL64-NEXT: v_writelane_b32 v40, s31, 3
+; DAGISEL64-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL64-NEXT: v_swap_b32 v0, v1
+; DAGISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; DAGISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
; DAGISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL64-NEXT: v_readlane_b32 s30, v40, 2
; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3
; DAGISEL64-NEXT: v_readlane_b32 s5, v40, 1
@@ -2592,18 +2589,17 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL64-NEXT: v_writelane_b32 v40, s0, 4
-; GISEL64-NEXT: v_mov_b32_e32 v2, v0
-; GISEL64-NEXT: v_swap_b32 v0, v1
-; GISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
-; GISEL64-NEXT: v_writelane_b32 v40, s4, 0
-; GISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
; GISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL64-NEXT: v_writelane_b32 v40, s4, 0
; GISEL64-NEXT: v_writelane_b32 v40, s5, 1
; GISEL64-NEXT: v_writelane_b32 v40, s30, 2
; GISEL64-NEXT: v_writelane_b32 v40, s31, 3
+; GISEL64-NEXT: v_mov_b32_e32 v2, v0
+; GISEL64-NEXT: v_swap_b32 v0, v1
+; GISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; GISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
; GISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL64-NEXT: v_readlane_b32 s30, v40, 2
; GISEL64-NEXT: v_readlane_b32 s31, v40, 3
; GISEL64-NEXT: v_readlane_b32 s5, v40, 1
@@ -3710,15 +3706,15 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 nv ; 4-byte Folded Spill
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s0, 3
-; GFX1250-DAGISEL-NEXT: v_mov_b32_e32 v2, v0
-; GFX1250-DAGISEL-NEXT: v_swap_b32 v0, v1
-; GFX1250-DAGISEL-NEXT: s_mov_b64 s[0:1], gfx_callee@abs64
-; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s4, 0
; GFX1250-DAGISEL-NEXT: s_addk_co_i32 s32, 0xe50
+; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s4, 0
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s30, 1
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s31, 2
+; GFX1250-DAGISEL-NEXT: v_mov_b32_e32 v2, v0
+; GFX1250-DAGISEL-NEXT: v_swap_b32 v0, v1
+; GFX1250-DAGISEL-NEXT: s_mov_b64 s[0:1], gfx_callee@abs64
+; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-DAGISEL-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
-; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s30, v40, 1
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s4, v40, 0
@@ -8039,16 +8035,15 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; DAGISEL-NEXT: scratch_store_b32 off, v41, s33 offset:168
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-NEXT: v_writelane_b32 v42, s0, 3
-; DAGISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
-; DAGISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
; DAGISEL-NEXT: s_addk_co_i32 s32, 0x250
-; DAGISEL-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8
; DAGISEL-NEXT: v_writelane_b32 v42, s4, 0
; DAGISEL-NEXT: v_writelane_b32 v42, s30, 1
; DAGISEL-NEXT: v_writelane_b32 v42, s31, 2
+; DAGISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
+; DAGISEL-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL-NEXT: v_readlane_b32 s30, v42, 1
; DAGISEL-NEXT: flat_store_b32 v[40:41], v0
; DAGISEL-NEXT: v_readlane_b32 s31, v42, 2
@@ -8381,16 +8376,15 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GISEL-NEXT: scratch_store_b32 off, v41, s33 offset:168
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: v_writelane_b32 v42, s0, 3
-; GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
-; GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
; GISEL-NEXT: s_addk_co_i32 s32, 0x250
-; GISEL-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v41, v9
; GISEL-NEXT: v_writelane_b32 v42, s4, 0
; GISEL-NEXT: v_writelane_b32 v42, s30, 1
; GISEL-NEXT: v_writelane_b32 v42, s31, 2
+; GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GISEL-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v41, v9
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_readlane_b32 s30, v42, 1
; GISEL-NEXT: flat_store_b32 v[40:41], v0
; GISEL-NEXT: v_readlane_b32 s31, v42, 2
@@ -8723,18 +8717,17 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; DAGISEL64-NEXT: scratch_store_b32 off, v41, s33 offset:168
; DAGISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL64-NEXT: v_writelane_b32 v42, s0, 4
-; DAGISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
-; DAGISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x250
-; DAGISEL64-NEXT: v_mov_b32_e32 v41, v9
; DAGISEL64-NEXT: v_writelane_b32 v42, s4, 0
-; DAGISEL64-NEXT: v_mov_b32_e32 v40, v8
; DAGISEL64-NEXT: v_writelane_b32 v42, s5, 1
; DAGISEL64-NEXT: v_writelane_b32 v42, s30, 2
; DAGISEL64-NEXT: v_writelane_b32 v42, s31, 3
+; DAGISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
+; DAGISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
+; DAGISEL64-NEXT: v_mov_b32_e32 v41, v9
+; DAGISEL64-NEXT: v_mov_b32_e32 v40, v8
; DAGISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL64-NEXT: v_readlane_b32 s30, v42, 2
; DAGISEL64-NEXT: flat_store_b32 v[40:41], v0
; DAGISEL64-NEXT: v_readlane_b32 s31, v42, 3
@@ -9068,18 +9061,17 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GISEL64-NEXT: scratch_store_b32 off, v41, s33 offset:168
; GISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL64-NEXT: v_writelane_b32 v42, s0, 4
-; GISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
-; GISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
; GISEL64-NEXT: s_addk_co_i32 s32, 0x250
-; GISEL64-NEXT: v_mov_b32_e32 v40, v8
; GISEL64-NEXT: v_writelane_b32 v42, s4, 0
-; GISEL64-NEXT: v_mov_b32_e32 v41, v9
; GISEL64-NEXT: v_writelane_b32 v42, s5, 1
; GISEL64-NEXT: v_writelane_b32 v42, s30, 2
; GISEL64-NEXT: v_writelane_b32 v42, s31, 3
+; GISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GISEL64-NEXT: v_mov_b32_e32 v40, v8
+; GISEL64-NEXT: v_mov_b32_e32 v41, v9
; GISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL64-NEXT: v_readlane_b32 s30, v42, 2
; GISEL64-NEXT: flat_store_b32 v[40:41], v0
; GISEL64-NEXT: v_readlane_b32 s31, v42, 3
@@ -10193,15 +10185,14 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v41, s33 offset:168 nv
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x2
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v42, s0, 3
-; GFX1250-DAGISEL-NEXT: s_mov_b64 s[0:1], callee@abs64
; GFX1250-DAGISEL-NEXT: s_addk_co_i32 s32, 0xe50
-; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-DAGISEL-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v42, s4, 0
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v42, s30, 1
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v42, s31, 2
+; GFX1250-DAGISEL-NEXT: s_mov_b64 s[0:1], callee@abs64
+; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-DAGISEL-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8
; GFX1250-DAGISEL-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
-; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s30, v42, 1
; GFX1250-DAGISEL-NEXT: flat_store_b32 v[40:41], v0
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v42, 2
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
index 4011d0990d5ab..27035e3c46125 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
@@ -22,9 +22,9 @@ define void @vector_reg_liverange_split() #0 {
; GFX90A-NEXT: v_writelane_b32 v40, s28, 2
; GFX90A-NEXT: v_writelane_b32 v40, s29, 3
; GFX90A-NEXT: v_writelane_b32 v40, s30, 0
-; GFX90A-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane
; GFX90A-NEXT: s_addk_i32 s32, 0x400
; GFX90A-NEXT: v_writelane_b32 v40, s31, 1
+; GFX90A-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s20
; GFX90A-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
index 5009f0249df6d..991a1024bd86a 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
@@ -27,9 +27,9 @@ define void @test() #0 {
; GCN-NEXT: v_writelane_b32 v40, s28, 2
; GCN-NEXT: v_writelane_b32 v40, s29, 3
; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s16
; GCN-NEXT: ;;#ASMEND
More information about the llvm-branch-commits
mailing list