[llvm] b25b4c0 - [AMDGPU] Separate out SGPR spills to VGPR lanes during PEI
Christudasan Devadasan via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 16 22:20:03 PST 2022
Author: Christudasan Devadasan
Date: 2022-12-17T11:49:41+05:30
New Revision: b25b4c0ab4ad1acc1490c7560970a2e80cf94b3e
URL: https://github.com/llvm/llvm-project/commit/b25b4c0ab4ad1acc1490c7560970a2e80cf94b3e
DIFF: https://github.com/llvm/llvm-project/commit/b25b4c0ab4ad1acc1490c7560970a2e80cf94b3e.diff
LOG: [AMDGPU] Separate out SGPR spills to VGPR lanes during PEI
The SILowerSGPRSpills pass handles the lowering of SGPR spills
into VGPR lanes. Some SGPR spills are handled later, during
PEI (PrologEpilogInserter). A common function is used in both
places to find a free VGPR lane. This patch removes that shared
dependency by finding the free VGPR lane separately for PEI. It
is a prerequisite for future work to allow SGPR spills to
virtual VGPR lanes during SILowerSGPRSpills.
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D124195
Added:
Modified:
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
llvm/test/CodeGen/AMDGPU/bf16.ll
llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
llvm/test/CodeGen/AMDGPU/indirect-call.ll
llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
llvm/test/CodeGen/AMDGPU/nested-calls.ll
llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
llvm/test/CodeGen/AMDGPU/save-fp.ll
llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
llvm/test/CodeGen/AMDGPU/sibling-call.ll
llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
llvm/test/CodeGen/AMDGPU/stack-realign.ll
llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
llvm/test/CodeGen/AMDGPU/wave32.ll
llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 7c222396fb79..bd32ef7bc9f0 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -69,25 +69,7 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
// We need to save and restore the current FP/BP.
- // 1: If there is already a VGPR with free lanes, use it. We
- // may already have to pay the penalty for spilling a CSR VGPR.
- if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
- int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
- TargetStackID::SGPRSpill);
-
- if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
- llvm_unreachable("allocate SGPR spill should have worked");
-
- FrameIndex = NewFI;
-
- LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
- dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
- << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
- << '\n');
- return;
- }
-
- // 2: Next, try to save the FP/BP in an unused SGPR.
+ // 1: Try to save the FP/BP in an unused SGPR.
TempSGPR = findScratchNonCalleeSaveRegister(
MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
@@ -95,21 +77,20 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
TargetStackID::SGPRSpill);
- if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
- // 3: There's no free lane to spill, and no free register to save FP/BP,
+ if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPRLane(
+ MF, NewFI, /* IsPrologEpilog */ true)) {
+ // 2: There's no free lane to spill, and no free register to save FP/BP,
// so we're forced to spill another VGPR to use for the spill.
- auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
- MFI->allocateWWMSpill(MF, Spill.VGPR);
-
FrameIndex = NewFI;
LLVM_DEBUG(
+ auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(NewFI).front();
dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
<< printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
} else {
// Remove dead <NewFI> index
MF.getFrameInfo().RemoveStackObject(NewFI);
- // 4: If all else fails, spill the FP/BP to memory.
+ // 3: If all else fails, spill the FP/BP to memory.
FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling "
<< (IsFP ? "FP" : "BP") << '\n');
@@ -822,7 +803,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
- FuncInfo->getSGPRToVGPRSpills(FI);
+ FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
assert(Spill.size() == 1);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
@@ -1020,7 +1001,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
auto RestoreSGPRFromVGPRLane = [&](Register Reg, const int FI) {
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
- FuncInfo->getSGPRToVGPRSpills(FI);
+ FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
assert(Spill.size() == 1);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), Reg)
.addReg(Spill[0].VGPR)
@@ -1266,13 +1247,6 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
- for (MachineBasicBlock &MBB : MF) {
- for (auto &Reg : MFI->getWWMSpills())
- MBB.addLiveIn(Reg.first);
-
- MBB.sortUniqueLiveIns();
- }
-
// Ignore the SGPRs the default implementation found.
SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
@@ -1318,6 +1292,14 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
MFI->BasePointerSaveIndex, false);
}
+
+ // Mark all lane VGPRs as BB LiveIns.
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto &Reg : MFI->getWWMSpills())
+ MBB.addLiveIn(Reg.first);
+
+ MBB.sortUniqueLiveIns();
+ }
}
void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 336e2a7d0da0..05eba7c26145 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -296,7 +296,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
- if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
+ if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
NewReservedRegs = true;
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
MI, FI, nullptr, Indexes, LIS);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 16171d202004..e1e987040228 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -290,21 +290,67 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
return false;
}
-/// \p returns true if \p NumLanes slots are available in VGPRs already used for
-/// SGPR spilling.
-//
-// FIXME: This only works after processFunctionBeforeFrameFinalized
-bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF,
- unsigned NumNeed) const {
+bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF,
+ int FI,
+ unsigned LaneIndex) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- unsigned WaveSize = ST.getWavefrontSize();
- return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register LaneVGPR;
+ if (!LaneIndex) {
+ LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+ if (LaneVGPR == AMDGPU::NoRegister) {
+ // We have no VGPRs left for spilling SGPRs. Reset because we will not
+ // partially spill the SGPR to VGPRs.
+ SGPRSpillToVGPRLanes.erase(FI);
+ return false;
+ }
+
+ SpillVGPRs.push_back(LaneVGPR);
+ // Add this register as live-in to all blocks to avoid machine verifier
+ // complaining about use of an undefined physical register.
+ for (MachineBasicBlock &BB : MF)
+ BB.addLiveIn(LaneVGPR);
+ } else {
+ LaneVGPR = SpillVGPRs.back();
+ }
+
+ SGPRSpillToVGPRLanes[FI].push_back(
+ SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
+ return true;
}
-/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
-bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
- int FI) {
- std::vector<SIRegisterInfo::SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];
+bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
+ MachineFunction &MF, int FI, unsigned LaneIndex) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register LaneVGPR;
+ if (!LaneIndex) {
+ LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+ if (LaneVGPR == AMDGPU::NoRegister) {
+ // We have no VGPRs left for spilling SGPRs. Reset because we will not
+ // partially spill the SGPR to VGPRs.
+ PrologEpilogSGPRSpillToVGPRLanes.erase(FI);
+ return false;
+ }
+
+ allocateWWMSpill(MF, LaneVGPR);
+ } else {
+ LaneVGPR = WWMSpills.back().first;
+ }
+
+ PrologEpilogSGPRSpillToVGPRLanes[FI].push_back(
+ SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
+ return true;
+}
+
+bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
+ int FI,
+ bool IsPrologEpilog) {
+ std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
+ IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI]
+ : SGPRSpillToVGPRLanes[FI];
// This has already been allocated.
if (!SpillLanes.empty())
@@ -313,7 +359,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned WaveSize = ST.getWavefrontSize();
unsigned Size = FrameInfo.getObjectSize(FI);
@@ -325,42 +370,20 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
assert(Size >= 4 && "invalid sgpr spill size");
assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
- // Make sure to handle the case where a wide SGPR spill may span between two
- // VGPRs.
- for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
- Register LaneVGPR;
- unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
-
- if (VGPRIndex == 0) {
- LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
- if (LaneVGPR == AMDGPU::NoRegister) {
- // We have no VGPRs left for spilling SGPRs. Reset because we will not
- // partially spill the SGPR to VGPRs.
- SGPRToVGPRSpills.erase(FI);
- NumVGPRSpillLanes -= I;
-
- // FIXME: We can run out of free registers with split allocation if
- // IPRA is enabled and a called function already uses every VGPR.
-#if 0
- DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(),
- "VGPRs for SGPR spilling",
- 0, DS_Error);
- MF.getFunction().getContext().diagnose(DiagOutOfRegs);
-#endif
- return false;
- }
+ unsigned &NumSpillLanes =
+ IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes;
- SpillVGPRs.push_back(LaneVGPR);
+ for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
+ unsigned LaneIndex = (NumSpillLanes % WaveSize);
- // Add this register as live-in to all blocks to avoid machine verifier
- // complaining about use of an undefined physical register.
- for (MachineBasicBlock &BB : MF)
- BB.addLiveIn(LaneVGPR);
- } else {
- LaneVGPR = SpillVGPRs.back();
+ bool Allocated =
+ IsPrologEpilog
+ ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex)
+ : allocateVGPRForSGPRSpills(MF, FI, LaneIndex);
+ if (!Allocated) {
+ NumSpillLanes -= I;
+ return false;
}
-
- SpillLanes.push_back(SIRegisterInfo::SpilledReg(LaneVGPR, VGPRIndex));
}
return true;
@@ -437,16 +460,14 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
bool SIMachineFunctionInfo::removeDeadFrameIndices(
MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
- // Remove dead frame indices from function frame, however keep FP & BP since
- // spills for them haven't been inserted yet. And also make sure to remove the
- // frame indices from `SGPRToVGPRSpills` data structure, otherwise, it could
- // result in an unexpected side effect and bug, in case of any re-mapping of
- // freed frame indices by later pass(es) like "stack slot coloring".
- for (auto &R : make_early_inc_range(SGPRToVGPRSpills)) {
- if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex) {
- MFI.RemoveStackObject(R.first);
- SGPRToVGPRSpills.erase(R.first);
- }
+ // Remove dead frame indices from function frame. And also make sure to remove
+ // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it
+ // could result in an unexpected side effect and bug, in case of any
+ // re-mapping of freed frame indices by later pass(es) like "stack slot
+ // coloring".
+ for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) {
+ MFI.RemoveStackObject(R.first);
+ SGPRSpillToVGPRLanes.erase(R.first);
}
bool HaveSGPRToMemory = false;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 6d0606402445..9b4a1d7bf843 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -438,10 +438,15 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
};
private:
- // Track VGPR + wave index for each subregister of the SGPR spilled to
- // frameindex key.
- DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRToVGPRSpills;
+ // To track VGPR + lane index for each subregister of the SGPR spilled to
+ // frameindex key during SILowerSGPRSpills pass.
+ DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRSpillToVGPRLanes;
+ // To track VGPR + lane index for spilling special SGPRs like Frame Pointer
+ // identified during PrologEpilogInserter.
+ DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>>
+ PrologEpilogSGPRSpillToVGPRLanes;
unsigned NumVGPRSpillLanes = 0;
+ unsigned NumVGPRPrologEpilogSpillLanes = 0;
SmallVector<Register, 2> SpillVGPRs;
using WWMSpillsMap = MapVector<Register, int>;
// To track the registers used in instructions that can potentially modify the
@@ -474,6 +479,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
private:
Register VGPRForAGPRCopy;
+ bool allocateVGPRForSGPRSpills(MachineFunction &MF, int FI,
+ unsigned LaneIndex);
+ bool allocateVGPRForPrologEpilogSGPRSpills(MachineFunction &MF, int FI,
+ unsigned LaneIndex);
+
public:
Register getVGPRForAGPRCopy() const {
return VGPRForAGPRCopy;
@@ -517,9 +527,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
}
ArrayRef<SIRegisterInfo::SpilledReg>
- getSGPRToVGPRSpills(int FrameIndex) const {
- auto I = SGPRToVGPRSpills.find(FrameIndex);
- return (I == SGPRToVGPRSpills.end())
+ getSGPRSpillToVGPRLanes(int FrameIndex) const {
+ auto I = SGPRSpillToVGPRLanes.find(FrameIndex);
+ return (I == SGPRSpillToVGPRLanes.end())
? ArrayRef<SIRegisterInfo::SpilledReg>()
: makeArrayRef(I->second);
}
@@ -528,6 +538,14 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
+ ArrayRef<SIRegisterInfo::SpilledReg>
+ getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const {
+ auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex);
+ return (I == PrologEpilogSGPRSpillToVGPRLanes.end())
+ ? ArrayRef<SIRegisterInfo::SpilledReg>()
+ : makeArrayRef(I->second);
+ }
+
void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4,
Align Alignment = Align(4));
@@ -551,9 +569,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
I->second.IsDead = true;
}
- bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
- unsigned NumLane) const;
- bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
+ bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI,
+ bool IsPrologEpilog = false);
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
/// If \p ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 87553f128327..2005f81e8cde 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1705,7 +1705,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
LiveIntervals *LIS, bool OnlyToVGPR) const {
SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRToVGPRSpills(Index);
+ ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
@@ -1822,7 +1822,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
LiveIntervals *LIS, bool OnlyToVGPR) const {
SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRToVGPRSpills(Index);
+ ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index ec5c224cb1b2..c2deb26f707f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -9,8 +9,9 @@ define ptr addrspace(1) @call_assert_align() {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
-; CHECK-NEXT: v_writelane_b32 v40, s33, 2
+; CHECK-NEXT: v_writelane_b32 v41, s33, 0
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
@@ -27,9 +28,10 @@ define ptr addrspace(1) @call_assert_align() {
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
-; CHECK-NEXT: v_readlane_b32 s33, v40, 2
+; CHECK-NEXT: v_readlane_b32 s33, v41, 0
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 7ed11c7abbb6..12994b4030e3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -235,8 +235,9 @@ define void @func_caller_stack() {
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
-; MUBUF-NEXT: v_writelane_b32 v40, s33, 2
+; MUBUF-NEXT: v_writelane_b32 v41, s33, 0
; MUBUF-NEXT: s_mov_b32 s33, s32
; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
@@ -256,9 +257,10 @@ define void @func_caller_stack() {
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
-; MUBUF-NEXT: v_readlane_b32 s33, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s33, v41, 0
; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_setpc_b64 s[30:31]
@@ -268,8 +270,9 @@ define void @func_caller_stack() {
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT: v_writelane_b32 v40, s33, 2
+; FLATSCR-NEXT: v_writelane_b32 v41, s33, 0
; FLATSCR-NEXT: s_mov_b32 s33, s32
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_mov_b32_e32 v0, 9
@@ -289,9 +292,10 @@ define void @func_caller_stack() {
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
-; FLATSCR-NEXT: v_readlane_b32 s33, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s33, v41, 0
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword v41, off, s32 offset:4 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
@@ -305,10 +309,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
-; MUBUF-NEXT: v_writelane_b32 v40, s33, 2
+; MUBUF-NEXT: v_writelane_b32 v41, s33, 0
; MUBUF-NEXT: s_mov_b32 s33, s32
; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
@@ -373,9 +378,10 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
-; MUBUF-NEXT: v_readlane_b32 s33, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s33, v41, 0
; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_setpc_b64 s[30:31]
@@ -385,9 +391,10 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off
-; FLATSCR-NEXT: v_writelane_b32 v40, s33, 2
+; FLATSCR-NEXT: v_writelane_b32 v41, s33, 0
; FLATSCR-NEXT: s_mov_b32 s33, s32
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
@@ -422,9 +429,10 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
-; FLATSCR-NEXT: v_readlane_b32 s33, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s33, v41, 0
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword v41, off, s32 offset:4 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index b5fa7de724bf..273975533bcd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -234,13 +234,14 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[16:17], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dword v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
@@ -248,9 +249,10 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 35557b32ae3a..61d2b410e549 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -19,8 +19,9 @@ define void @parent_func_missing_inputs() #0 {
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FIXEDABI-NEXT: s_or_saveexec_b64 s[16:17], -1
; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; FIXEDABI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; FIXEDABI-NEXT: s_mov_b64 exec, s[16:17]
-; FIXEDABI-NEXT: v_writelane_b32 v40, s33, 2
+; FIXEDABI-NEXT: v_writelane_b32 v41, s33, 0
; FIXEDABI-NEXT: s_mov_b32 s33, s32
; FIXEDABI-NEXT: s_addk_i32 s32, 0x400
; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0
@@ -32,9 +33,10 @@ define void @parent_func_missing_inputs() #0 {
; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00
-; FIXEDABI-NEXT: v_readlane_b32 s33, v40, 2
+; FIXEDABI-NEXT: v_readlane_b32 s33, v41, 0
; FIXEDABI-NEXT: s_or_saveexec_b64 s[4:5], -1
; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; FIXEDABI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; FIXEDABI-NEXT: s_mov_b64 exec, s[4:5]
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
; FIXEDABI-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 7ef295cf5e99..573613da596e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -172,7 +172,7 @@ attributes #0 = { nounwind }
; GCN-NEXT: .vgpr_count: 0x1{{$}}
; GCN-NEXT: no_stack_call:
; GCN-NEXT: .lds_size: 0{{$}}
-; GCN-NEXT: .sgpr_count: 0x24{{$}}
+; GCN-NEXT: .sgpr_count: 0x25{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT: .vgpr_count: 0x3{{$}}
; GCN-NEXT: no_stack_extern_call:
@@ -180,19 +180,19 @@ attributes #0 = { nounwind }
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT: .vgpr_count: 0x2b{{$}}
+; GCN-NEXT: .vgpr_count: 0x2c{{$}}
; GCN-NEXT: no_stack_extern_call_many_args:
; GCN-NEXT: .lds_size: 0{{$}}
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}}
-; GCN-NEXT: .vgpr_count: 0x2b{{$}}
+; GCN-NEXT: .vgpr_count: 0x2c{{$}}
; GCN-NEXT: no_stack_indirect_call:
; GCN-NEXT: .lds_size: 0{{$}}
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT: .vgpr_count: 0x2b{{$}}
+; GCN-NEXT: .vgpr_count: 0x2c{{$}}
; GCN-NEXT: simple_lds:
; GCN-NEXT: .lds_size: 0x100{{$}}
; GCN-NEXT: .sgpr_count: 0x20{{$}}
@@ -202,7 +202,7 @@ attributes #0 = { nounwind }
; GCN-NEXT: .lds_size: 0x100{{$}}
; GCN-NEXT: .sgpr_count: 0x26{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT: .vgpr_count: 0x29{{$}}
+; GCN-NEXT: .vgpr_count: 0x2a{{$}}
; GCN-NEXT: simple_stack:
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x21{{$}}
@@ -210,7 +210,7 @@ attributes #0 = { nounwind }
; GCN-NEXT: .vgpr_count: 0x2{{$}}
; GCN-NEXT: simple_stack_call:
; GCN-NEXT: .lds_size: 0{{$}}
-; GCN-NEXT: .sgpr_count: 0x24{{$}}
+; GCN-NEXT: .sgpr_count: 0x25{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT: .vgpr_count: 0x4{{$}}
; GCN-NEXT: simple_stack_extern_call:
@@ -218,16 +218,16 @@ attributes #0 = { nounwind }
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT: .vgpr_count: 0x2b{{$}}
+; GCN-NEXT: .vgpr_count: 0x2c{{$}}
; GCN-NEXT: simple_stack_indirect_call:
; GCN-NEXT: .lds_size: 0{{$}}
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
-; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT: .vgpr_count: 0x2b{{$}}
+; GCN-NEXT: .stack_frame_size_in_bytes: 0x30{{$}}
+; GCN-NEXT: .vgpr_count: 0x2c{{$}}
; GCN-NEXT: simple_stack_recurse:
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x26{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT: .vgpr_count: 0x2a{{$}}
+; GCN-NEXT: .vgpr_count: 0x2b{{$}}
; GCN-NEXT: ...
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 8a7ce09562fb..ce5bfe84269b 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -1420,10 +1420,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v2, s33, 2
+; GCN-NEXT: s_mov_b32 s8, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v2, s30, 0
; GCN-NEXT: v_writelane_b32 v2, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -1438,7 +1438,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_readlane_b32 s31, v2, 1
; GCN-NEXT: v_readlane_b32 s30, v2, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v2, 2
+; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
@@ -1451,7 +1451,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_writelane_b32 v2, s33, 2
+; GFX7-NEXT: s_mov_b32 s8, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[4:5]
@@ -1468,7 +1468,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX7-NEXT: v_readlane_b32 s31, v2, 1
; GFX7-NEXT: v_readlane_b32 s30, v2, 0
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: v_readlane_b32 s33, v2, 2
+; GFX7-NEXT: s_mov_b32 s33, s8
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
@@ -1481,7 +1481,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v2, s33, 2
+; GFX8-NEXT: s_mov_b32 s6, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[4:5]
@@ -1498,7 +1498,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: v_readlane_b32 s33, v2, 2
+; GFX8-NEXT: s_mov_b32 s33, s6
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
@@ -1511,7 +1511,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v2, s33, 2
+; GFX9-NEXT: s_mov_b32 s6, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
@@ -1527,7 +1527,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v2, 2
+; GFX9-NEXT: s_mov_b32 s33, s6
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -1542,14 +1542,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: v_writelane_b32 v2, s33, 2
+; GFX10-NEXT: s_mov_b32 s6, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v2, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -1558,7 +1558,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v2, 2
+; GFX10-NEXT: s_mov_b32 s33, s6
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -1578,10 +1578,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v3, s33, 2
+; GCN-NEXT: s_mov_b32 s8, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v3, s30, 0
; GCN-NEXT: v_writelane_b32 v3, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -1600,7 +1600,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_readlane_b32 s31, v3, 1
; GCN-NEXT: v_readlane_b32 s30, v3, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v3, 2
+; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
@@ -1613,7 +1613,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_writelane_b32 v3, s33, 2
+; GFX7-NEXT: s_mov_b32 s8, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[4:5]
@@ -1634,7 +1634,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: v_readlane_b32 s31, v3, 1
; GFX7-NEXT: v_readlane_b32 s30, v3, 0
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: v_readlane_b32 s33, v3, 2
+; GFX7-NEXT: s_mov_b32 s33, s8
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
@@ -1647,7 +1647,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v2, s33, 2
+; GFX8-NEXT: s_mov_b32 s6, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[4:5]
@@ -1663,7 +1663,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: v_readlane_b32 s33, v2, 2
+; GFX8-NEXT: s_mov_b32 s33, s6
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
@@ -1676,7 +1676,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v2, s33, 2
+; GFX9-NEXT: s_mov_b32 s6, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
@@ -1692,7 +1692,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v2, 2
+; GFX9-NEXT: s_mov_b32 s33, s6
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -1707,14 +1707,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: v_writelane_b32 v2, s33, 2
+; GFX10-NEXT: s_mov_b32 s6, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v2, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -1723,7 +1723,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v2, 2
+; GFX10-NEXT: s_mov_b32 s33, s6
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -1743,10 +1743,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v4, s33, 2
+; GCN-NEXT: s_mov_b32 s8, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v4, s30, 0
; GCN-NEXT: v_writelane_b32 v4, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -1766,7 +1766,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_readlane_b32 s31, v4, 1
; GCN-NEXT: v_readlane_b32 s30, v4, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v4, 2
+; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
@@ -1779,7 +1779,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_writelane_b32 v4, s33, 2
+; GFX7-NEXT: s_mov_b32 s8, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[4:5]
@@ -1801,7 +1801,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: v_readlane_b32 s31, v4, 1
; GFX7-NEXT: v_readlane_b32 s30, v4, 0
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: v_readlane_b32 s33, v4, 2
+; GFX7-NEXT: s_mov_b32 s33, s8
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
@@ -1814,7 +1814,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v3, s33, 2
+; GFX8-NEXT: s_mov_b32 s6, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[4:5]
@@ -1834,7 +1834,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: v_readlane_b32 s31, v3, 1
; GFX8-NEXT: v_readlane_b32 s30, v3, 0
; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: v_readlane_b32 s33, v3, 2
+; GFX8-NEXT: s_mov_b32 s33, s6
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
@@ -1847,7 +1847,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v3, s33, 2
+; GFX9-NEXT: s_mov_b32 s6, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
@@ -1869,7 +1869,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v3, 2
+; GFX9-NEXT: s_mov_b32 s33, s6
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -1884,7 +1884,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: v_writelane_b32 v3, s33, 2
+; GFX10-NEXT: s_mov_b32 s6, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[4:5]
@@ -1905,7 +1905,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v3, 2
+; GFX10-NEXT: s_mov_b32 s33, s6
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -1925,10 +1925,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v5, s33, 2
+; GCN-NEXT: s_mov_b32 s8, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v5, s30, 0
; GCN-NEXT: v_writelane_b32 v5, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -1955,7 +1955,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_readlane_b32 s31, v5, 1
; GCN-NEXT: v_readlane_b32 s30, v5, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v5, 2
+; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
@@ -1968,7 +1968,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_writelane_b32 v5, s33, 2
+; GFX7-NEXT: s_mov_b32 s8, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[4:5]
@@ -1997,7 +1997,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: v_readlane_b32 s31, v5, 1
; GFX7-NEXT: v_readlane_b32 s30, v5, 0
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: v_readlane_b32 s33, v5, 2
+; GFX7-NEXT: s_mov_b32 s33, s8
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
@@ -2010,7 +2010,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v3, s33, 2
+; GFX8-NEXT: s_mov_b32 s6, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[4:5]
@@ -2037,7 +2037,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: v_readlane_b32 s31, v3, 1
; GFX8-NEXT: v_readlane_b32 s30, v3, 0
; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: v_readlane_b32 s33, v3, 2
+; GFX8-NEXT: s_mov_b32 s33, s6
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
@@ -2050,7 +2050,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v3, s33, 2
+; GFX9-NEXT: s_mov_b32 s6, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
@@ -2072,7 +2072,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v3, 2
+; GFX9-NEXT: s_mov_b32 s33, s6
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -2087,14 +2087,14 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: v_writelane_b32 v3, s33, 2
+; GFX10-NEXT: s_mov_b32 s6, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v3, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -2109,7 +2109,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v3, 2
+; GFX10-NEXT: s_mov_b32 s33, s6
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -2129,10 +2129,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v9, s33, 2
+; GCN-NEXT: s_mov_b32 s8, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v9, s30, 0
; GCN-NEXT: v_writelane_b32 v9, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -2175,7 +2175,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_readlane_b32 s31, v9, 1
; GCN-NEXT: v_readlane_b32 s30, v9, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v9, 2
+; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
@@ -2188,7 +2188,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_writelane_b32 v9, s33, 2
+; GFX7-NEXT: s_mov_b32 s8, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[4:5]
@@ -2233,7 +2233,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: v_readlane_b32 s31, v9, 1
; GFX7-NEXT: v_readlane_b32 s30, v9, 0
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: v_readlane_b32 s33, v9, 2
+; GFX7-NEXT: s_mov_b32 s33, s8
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
@@ -2246,7 +2246,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v5, s33, 2
+; GFX8-NEXT: s_mov_b32 s6, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[4:5]
@@ -2287,7 +2287,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: v_readlane_b32 s31, v5, 1
; GFX8-NEXT: v_readlane_b32 s30, v5, 0
; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: v_readlane_b32 s33, v5, 2
+; GFX8-NEXT: s_mov_b32 s33, s6
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
@@ -2300,7 +2300,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v5, s33, 2
+; GFX9-NEXT: s_mov_b32 s6, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
@@ -2330,7 +2330,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: v_readlane_b32 s31, v5, 1
; GFX9-NEXT: v_readlane_b32 s30, v5, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v5, 2
+; GFX9-NEXT: s_mov_b32 s33, s6
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -2345,14 +2345,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: v_writelane_b32 v5, s33, 2
+; GFX10-NEXT: s_mov_b32 s6, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v5, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v5, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -2375,7 +2375,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_readlane_b32 s31, v5, 1
; GFX10-NEXT: v_readlane_b32 s30, v5, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v5, 2
+; GFX10-NEXT: s_mov_b32 s33, s6
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -2395,10 +2395,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v17, s33, 2
+; GCN-NEXT: s_mov_b32 s8, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v17, s30, 0
; GCN-NEXT: v_writelane_b32 v17, s31, 1
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -2473,7 +2473,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_readlane_b32 s31, v17, 1
; GCN-NEXT: v_readlane_b32 s30, v17, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v17, 2
+; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
@@ -2486,7 +2486,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_writelane_b32 v17, s33, 2
+; GFX7-NEXT: s_mov_b32 s8, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[4:5]
@@ -2563,7 +2563,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: v_readlane_b32 s31, v17, 1
; GFX7-NEXT: v_readlane_b32 s30, v17, 0
; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: v_readlane_b32 s33, v17, 2
+; GFX7-NEXT: s_mov_b32 s33, s8
; GFX7-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
@@ -2576,7 +2576,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v9, s33, 2
+; GFX8-NEXT: s_mov_b32 s6, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[4:5]
@@ -2645,7 +2645,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: v_readlane_b32 s31, v9, 1
; GFX8-NEXT: v_readlane_b32 s30, v9, 0
; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: v_readlane_b32 s33, v9, 2
+; GFX8-NEXT: s_mov_b32 s33, s6
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
@@ -2658,7 +2658,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v9, s33, 2
+; GFX9-NEXT: s_mov_b32 s6, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
@@ -2704,7 +2704,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: v_readlane_b32 s31, v9, 1
; GFX9-NEXT: v_readlane_b32 s30, v9, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v9, 2
+; GFX9-NEXT: s_mov_b32 s33, s6
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -2719,14 +2719,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: v_writelane_b32 v9, s33, 2
+; GFX10-NEXT: s_mov_b32 s6, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v9, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v9, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -2765,7 +2765,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_readlane_b32 s31, v9, 1
; GFX10-NEXT: v_readlane_b32 s30, v9, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v9, 2
+; GFX10-NEXT: s_mov_b32 s33, s6
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index faa83251f16c..a24ba0555c7a 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -14,16 +14,16 @@ define void @use_vcc() #1 {
}
; GCN-LABEL: {{^}}indirect_use_vcc:
-; GCN: v_writelane_b32 v40, s33, 2
+; GCN: v_writelane_b32 v41, s33, 0
; GCN: v_writelane_b32 v40, s30, 0
; GCN: v_writelane_b32 v40, s31, 1
; GCN: s_swappc_b64
; GCN: v_readlane_b32 s31, v40, 1
; GCN: v_readlane_b32 s30, v40, 0
-; GCN: v_readlane_b32 s33, v40, 2
+; GCN: v_readlane_b32 s33, v41, 0
; GCN: s_setpc_b64 s[30:31]
; GCN: ; NumSgprs: 36
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
define void @indirect_use_vcc() #1 {
call void @use_vcc()
ret void
@@ -34,7 +34,7 @@ define void @indirect_use_vcc() #1 {
; CI: ; NumSgprs: 38
; VI-NOBUG: ; NumSgprs: 40
; VI-BUG: ; NumSgprs: 96
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
call void @indirect_use_vcc()
ret void
@@ -52,7 +52,7 @@ define void @use_flat_scratch() #1 {
; GCN-LABEL: {{^}}indirect_use_flat_scratch:
; CI: ; NumSgprs: 38
; VI: ; NumSgprs: 40
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
define void @indirect_use_flat_scratch() #1 {
call void @use_flat_scratch()
ret void
@@ -63,7 +63,7 @@ define void @indirect_use_flat_scratch() #1 {
; CI: ; NumSgprs: 38
; VI-NOBUG: ; NumSgprs: 40
; VI-BUG: ; NumSgprs: 96
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 {
call void @indirect_use_flat_scratch()
ret void
@@ -78,7 +78,7 @@ define void @use_10_vgpr() #1 {
}
; GCN-LABEL: {{^}}indirect_use_10_vgpr:
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
define void @indirect_use_10_vgpr() #0 {
call void @use_10_vgpr()
ret void
@@ -86,7 +86,7 @@ define void @indirect_use_10_vgpr() #0 {
; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
; GCN: is_dynamic_callstack = 0
-; GCN: ; NumVgprs: 41
+; GCN: ; NumVgprs: 42
define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
call void @indirect_use_10_vgpr()
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index e9a82ebae7a8..53e3e7c4f8e2 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -23,10 +23,12 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
; MUBUF: buffer_store_dword
+; MUBUF: buffer_store_dword
+; FLATSCR: scratch_store_dword
; FLATSCR: scratch_store_dword
-; GCN: v_writelane_b32 v40, s33, 4
; GCN: v_writelane_b32 v40, s30, 0
; GCN: v_writelane_b32 v40, s31, 1
+; GCN: v_writelane_b32 v41, s33, 0
; GCN: v_writelane_b32 v40, s34, 2
; GCN: v_writelane_b32 v40, s35, 3
@@ -41,8 +43,10 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
; FLATSCR-DAG: v_readlane_b32 s31, v40, 1
; FLATSCR-DAG: v_readlane_b32 s30, v40, 0
-; GCN: v_readlane_b32 s33, v40, 4
+; GCN: v_readlane_b32 s33, v41, 0
; MUBUF: buffer_load_dword
+; MUBUF: buffer_load_dword
+; FLATSCR: scratch_load_dword
; FLATSCR: scratch_load_dword
; GCN: s_setpc_b64 s[30:31]
define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
@@ -54,8 +58,10 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
; GCN-LABEL: {{^}}test_func_call_external_void_funcx2:
; MUBUF: buffer_store_dword v40
+; MUBUF: buffer_store_dword v41
; FLATSCR: scratch_store_dword off, v40
-; GCN: v_writelane_b32 v40, s33, 4
+; FLATSCR: scratch_store_dword off, v41
+; GCN: v_writelane_b32 v41, s33, 0
; GCN: s_mov_b32 s33, s32
; MUBUF: s_addk_i32 s32, 0x400
@@ -63,9 +69,11 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
; GCN: s_swappc_b64
; GCN-NEXT: s_swappc_b64
-; GCN: v_readlane_b32 s33, v40, 4
+; GCN: v_readlane_b32 s33, v41, 0
; MUBUF: buffer_load_dword v40
+; MUBUF: buffer_load_dword v41
; FLATSCR: scratch_load_dword v40
+; FLATSCR: scratch_load_dword v41
define void @test_func_call_external_void_funcx2() #0 {
call void @external_void_func_void()
call void @external_void_func_void()
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index f0af8f0d733f..68b8ab5ad069 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -87,9 +87,11 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
; GCN-NEXT: s_waitcnt
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN: v_writelane_b32 [[CSR_VGPR]], s33, 2
+; GCN: v_writelane_b32 [[CSR_VGPR_1]], s33, 0
; GCN-DAG: s_mov_b32 s33, s32
; MUBUF-DAG: s_addk_i32 s32, 0x400{{$}}
; FLATSCR-DAG: s_add_i32 s32, s32, 16{{$}}
@@ -107,10 +109,12 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
; MUBUF: s_addk_i32 s32, 0xfc00{{$}}
; FLATSCR: s_add_i32 s32, s32, -16{{$}}
-; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
+; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR_1]], 0
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -132,11 +136,13 @@ define void @callee_with_stack_and_call() #0 {
; GCN: s_waitcnt
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; MUBUF-DAG: s_addk_i32 s32, 0x400
; FLATSCR-DAG: s_add_i32 s32, s32, 16
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s33, [[FP_SPILL_LANE:[0-9]+]]
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR_1]], s33, [[FP_SPILL_LANE:[0-9]+]]
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
@@ -147,10 +153,12 @@ define void @callee_with_stack_and_call() #0 {
; MUBUF: s_addk_i32 s32, 0xfc00
; FLATSCR: s_add_i32 s32, s32, -16
-; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], [[FP_SPILL_LANE]]
+; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR_1]], [[FP_SPILL_LANE]]
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -269,8 +277,8 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-NEXT: v_writelane_b32 v0, s33, 63
; GCN-COUNT-60: v_writelane_b32 v0
+; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33
; GCN: s_mov_b32 s33, s32
; GCN: v_writelane_b32 v0
; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -285,7 +293,7 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
; MUBUF: s_addk_i32 s32, 0xfc00
; FLATSCR: s_add_i32 s32, s32, 16
; FLATSCR: s_add_i32 s32, s32, -16
-; GCN-NEXT: v_readlane_b32 s33, v0, 63
+; GCN-NEXT: s_mov_b32 s33, [[TMP_SGPR]]
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload
@@ -389,7 +397,7 @@ define void @realign_stack_no_fp_elim() #1 {
; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
+; GCN-NEXT: s_mov_b32 vcc_lo, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
@@ -404,7 +412,7 @@ define void @realign_stack_no_fp_elim() #1 {
; GCN: v_readlane_b32 s30, [[CSR_VGPR]], 0
; MUBUF: s_addk_i32 s32, 0xfd00
; FLATSCR: s_add_i32 s32, s32, -12
-; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
+; GCN-NEXT: s_mov_b32 s33, vcc_lo
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload
@@ -432,7 +440,7 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
+; GCN-NEXT: s_mov_b32 vcc_lo, s33
; GCN-NEXT: s_mov_b32 s33, s32
; MUBUF: s_addk_i32 s32, 0x300{{$}}
; FLATSCR: s_add_i32 s32, s32, 12{{$}}
@@ -443,7 +451,7 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; GCN: ;;#ASMSTART
; MUBUF: s_addk_i32 s32, 0xfd00{{$}}
; FLATSCR: s_add_i32 s32, s32, -12{{$}}
-; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
+; GCN-NEXT: s_mov_b32 s33, vcc_lo
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload
@@ -480,7 +488,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
+; GCN-NEXT: s_mov_b32 vcc_lo, s33
; GCN-DAG: s_mov_b32 s33, s32
; MUBUF-DAG: s_add_i32 s32, s32, 0x40300{{$}}
; FLATSCR-DAG: s_addk_i32 s32, 0x100c{{$}}
@@ -490,7 +498,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; GCN: ;;#ASMSTART
; MUBUF: s_add_i32 s32, s32, 0xfffbfd00{{$}}
; FLATSCR: s_addk_i32 s32, 0xeff4{{$}}
-; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
+; GCN-NEXT: s_mov_b32 s33, vcc_lo
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100
; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload
@@ -530,7 +538,7 @@ define internal void @local_empty_func() #0 {
; An FP is needed, despite not needing any spills
; TODO: Ccould see callee does not use stack and omit FP.
; GCN-LABEL: {{^}}ipra_call_with_stack:
-; GCN: v_writelane_b32 v0, s33, 2
+; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33
; GCN: s_mov_b32 s33, s32
; MUBUF: s_addk_i32 s32, 0x400
; FLATSCR: s_add_i32 s32, s32, 16
@@ -539,7 +547,7 @@ define internal void @local_empty_func() #0 {
; GCN: s_swappc_b64
; MUBUF: s_addk_i32 s32, 0xfc00
; FLATSCR: s_add_i32 s32, s32, -16
-; GCN: v_readlane_b32 s33, v0, 2
+; GCN: s_mov_b32 s33, [[TMP_SGPR]]
define void @ipra_call_with_stack() #0 {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 5d985850446c..a7b5e61b63be 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -29,8 +29,9 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 2
+; GCN-NEXT: v_writelane_b32 v41, s33, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
@@ -42,9 +43,10 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 2
+; GCN-NEXT: v_readlane_b32 s33, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -63,8 +65,9 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 2
+; GCN-NEXT: v_writelane_b32 v41, s33, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
@@ -76,9 +79,10 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 2
+; GCN-NEXT: v_readlane_b32 s33, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -97,8 +101,9 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 2
+; GCN-NEXT: v_writelane_b32 v41, s33, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
@@ -110,9 +115,10 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 2
+; GCN-NEXT: v_readlane_b32 s33, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -131,8 +137,9 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 2
+; GCN-NEXT: v_writelane_b32 v41, s33, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
@@ -145,9 +152,10 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 2
+; GCN-NEXT: v_readlane_b32 s33, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index 94fe3f0e9fc1..7c829bf9fdd8 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -18,8 +18,8 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
-; CHECK-NEXT: v_writelane_b32 v40, s33, 16
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
@@ -33,6 +33,7 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
+; CHECK-NEXT: v_writelane_b32 v42, s33, 0
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
@@ -91,9 +92,10 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
-; CHECK-NEXT: v_readlane_b32 s33, v40, 16
+; CHECK-NEXT: v_readlane_b32 s33, v42, 0
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 55cd662a0bd8..c47fdbac8665 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -12,8 +12,9 @@ define void @callee_with_stack_and_call() #0 {
; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1
; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
-; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s33, 2
+; SPILL-TO-VGPR-NEXT: v_writelane_b32 v41, s33, 0
; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32
; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0
@@ -28,9 +29,10 @@ define void @callee_with_stack_and_call() #0 {
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0
; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00
-; SPILL-TO-VGPR-NEXT: v_readlane_b32 s33, v40, 2
+; SPILL-TO-VGPR-NEXT: v_readlane_b32 s33, v41, 0
; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[4:5], -1
; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SPILL-TO-VGPR-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 8301b369c5f4..049335211372 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -11,7 +11,6 @@ define amdgpu_gfx void @gfx_func() {
; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1
; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SDAG-NEXT: s_mov_b64 exec, s[34:35]
-; SDAG-NEXT: v_writelane_b32 v40, s33, 28
; SDAG-NEXT: v_writelane_b32 v40, s4, 0
; SDAG-NEXT: v_writelane_b32 v40, s5, 1
; SDAG-NEXT: v_writelane_b32 v40, s6, 2
@@ -32,6 +31,7 @@ define amdgpu_gfx void @gfx_func() {
; SDAG-NEXT: v_writelane_b32 v40, s21, 17
; SDAG-NEXT: v_writelane_b32 v40, s22, 18
; SDAG-NEXT: v_writelane_b32 v40, s23, 19
+; SDAG-NEXT: s_mov_b32 s36, s33
; SDAG-NEXT: s_mov_b32 s33, s32
; SDAG-NEXT: s_addk_i32 s32, 0x400
; SDAG-NEXT: v_writelane_b32 v40, s24, 20
@@ -78,7 +78,7 @@ define amdgpu_gfx void @gfx_func() {
; SDAG-NEXT: v_readlane_b32 s5, v40, 1
; SDAG-NEXT: v_readlane_b32 s4, v40, 0
; SDAG-NEXT: s_addk_i32 s32, 0xfc00
-; SDAG-NEXT: v_readlane_b32 s33, v40, 28
+; SDAG-NEXT: s_mov_b32 s33, s36
; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1
; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; SDAG-NEXT: s_mov_b64 exec, s[34:35]
@@ -91,7 +91,6 @@ define amdgpu_gfx void @gfx_func() {
; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[34:35]
-; GISEL-NEXT: v_writelane_b32 v40, s33, 28
; GISEL-NEXT: v_writelane_b32 v40, s4, 0
; GISEL-NEXT: v_writelane_b32 v40, s5, 1
; GISEL-NEXT: v_writelane_b32 v40, s6, 2
@@ -112,6 +111,7 @@ define amdgpu_gfx void @gfx_func() {
; GISEL-NEXT: v_writelane_b32 v40, s21, 17
; GISEL-NEXT: v_writelane_b32 v40, s22, 18
; GISEL-NEXT: v_writelane_b32 v40, s23, 19
+; GISEL-NEXT: s_mov_b32 s36, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s24, 20
@@ -158,7 +158,7 @@ define amdgpu_gfx void @gfx_func() {
; GISEL-NEXT: v_readlane_b32 s5, v40, 1
; GISEL-NEXT: v_readlane_b32 s4, v40, 0
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
-; GISEL-NEXT: v_readlane_b32 s33, v40, 28
+; GISEL-NEXT: s_mov_b32 s33, s36
; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[34:35]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 56db9083be9f..9043bf065cbc 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -99,8 +99,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -114,9 +115,10 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -127,25 +129,28 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -156,26 +161,29 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -186,25 +194,28 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -219,10 +230,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -236,9 +248,10 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -249,17 +262,18 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
@@ -267,9 +281,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -280,17 +296,19 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
@@ -298,9 +316,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -311,17 +331,18 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
@@ -329,9 +350,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -347,10 +370,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -364,9 +388,10 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -377,17 +402,18 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
@@ -395,9 +421,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -408,17 +436,19 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
@@ -426,9 +456,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -439,17 +471,18 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
@@ -457,9 +490,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -475,8 +510,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -489,9 +525,10 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -502,24 +539,27 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -530,25 +570,29 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -559,24 +603,27 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -591,10 +638,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -606,9 +654,10 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -619,25 +668,28 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_signext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -648,26 +700,30 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_i8 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -678,25 +734,28 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -712,10 +771,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -727,9 +787,10 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -740,25 +801,28 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_zeroext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -769,26 +833,30 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -799,25 +867,28 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -833,8 +904,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -847,9 +919,10 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -860,24 +933,27 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -888,25 +964,29 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -917,24 +997,27 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -949,10 +1032,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -964,9 +1048,10 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -977,25 +1062,28 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_signext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1006,26 +1094,30 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1036,25 +1128,28 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -1070,10 +1165,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -1085,9 +1181,10 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1098,25 +1195,28 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_zeroext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1127,26 +1227,30 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1157,25 +1261,28 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -1191,8 +1298,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -1205,9 +1313,10 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1218,24 +1327,27 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1246,25 +1358,29 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1275,24 +1391,27 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -1307,8 +1426,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -1322,9 +1442,10 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1335,25 +1456,28 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1364,25 +1488,29 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1393,25 +1521,28 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -1426,11 +1557,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -1442,9 +1574,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1455,26 +1588,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1485,27 +1621,31 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1516,26 +1656,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -1551,8 +1694,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -1568,9 +1712,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1581,27 +1726,30 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1612,26 +1760,30 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1642,27 +1794,30 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -1677,11 +1832,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -1695,9 +1851,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1708,28 +1865,31 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 1
; GFX10-NEXT: v_mov_b32_e32 v5, 2
-; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1740,27 +1900,31 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1771,28 +1935,31 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -1810,11 +1977,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -1830,9 +1998,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1843,30 +2012,33 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 1
; GFX10-NEXT: v_mov_b32_e32 v5, 2
; GFX10-NEXT: v_mov_b32_e32 v6, 3
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v7, 4
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1877,28 +2049,32 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1909,30 +2085,33 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -1949,8 +2128,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -1963,9 +2143,10 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1976,24 +2157,27 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2004,25 +2188,29 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2033,24 +2221,27 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -2065,8 +2256,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -2079,9 +2271,10 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2092,24 +2285,27 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2120,25 +2316,29 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2149,24 +2349,27 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -2181,8 +2384,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -2196,9 +2400,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2209,25 +2414,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2238,25 +2446,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2267,25 +2479,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -2300,8 +2515,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -2316,9 +2532,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2329,26 +2546,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 4.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2359,26 +2579,30 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_mov_b32_e32 v2, 4.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2389,26 +2613,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -2423,8 +2650,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -2441,9 +2669,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2454,28 +2683,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 4.0
; GFX10-NEXT: v_mov_b32_e32 v3, -1.0
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0.5
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2486,27 +2718,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v4, 0.5
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2517,28 +2753,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0.5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -2553,8 +2792,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -2568,9 +2808,10 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2581,25 +2822,28 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2610,25 +2854,29 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2639,25 +2887,28 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -2672,8 +2923,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -2689,9 +2941,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2702,27 +2955,30 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2733,26 +2989,30 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2763,27 +3023,30 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -2798,8 +3061,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -2817,9 +3081,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2830,16 +3095,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
@@ -2850,9 +3116,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2863,13 +3131,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
@@ -2881,9 +3151,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2894,16 +3166,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x40200000
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
@@ -2914,9 +3187,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -2931,9 +3206,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -2945,9 +3221,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2958,24 +3235,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2986,25 +3266,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3015,24 +3299,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -3048,9 +3335,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -3062,9 +3350,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3075,24 +3364,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3103,25 +3395,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3132,24 +3428,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -3165,9 +3464,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -3179,9 +3479,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3192,24 +3493,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3220,25 +3524,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3249,24 +3557,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -3282,8 +3593,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -3297,9 +3609,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3310,25 +3623,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT: v_mov_b32_e32 v1, 3
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3339,25 +3655,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3368,25 +3688,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -3401,8 +3724,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -3416,9 +3740,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3429,25 +3754,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3458,26 +3786,30 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3488,25 +3820,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -3521,9 +3856,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -3535,9 +3871,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3548,24 +3885,27 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3576,25 +3916,29 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3605,24 +3949,27 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -3638,8 +3985,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -3653,9 +4001,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3666,25 +4015,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3695,26 +4047,30 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3725,25 +4081,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -3758,9 +4117,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -3772,9 +4132,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3785,24 +4146,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3813,25 +4177,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3842,24 +4210,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -3875,9 +4246,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -3889,9 +4261,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3902,24 +4275,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3930,25 +4306,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3959,24 +4339,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -3992,8 +4375,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -4007,9 +4391,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4020,25 +4405,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -4049,25 +4437,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4078,25 +4470,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -4111,8 +4506,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -4127,9 +4523,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4140,26 +4537,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 4
; GFX10-NEXT: v_mov_b32_e32 v2, 5
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -4170,26 +4570,30 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT: v_mov_b32_e32 v2, 5
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4200,26 +4604,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -4234,8 +4641,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -4251,9 +4659,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4264,27 +4673,30 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 4
; GFX10-NEXT: v_mov_b32_e32 v2, 5
; GFX10-NEXT: v_mov_b32_e32 v3, 6
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -4295,26 +4707,30 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4325,27 +4741,30 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -4360,9 +4779,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -4374,9 +4794,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4387,24 +4808,27 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -4415,25 +4839,29 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4444,24 +4872,27 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -4477,8 +4908,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -4494,9 +4926,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4507,27 +4940,30 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -4538,26 +4974,30 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4568,27 +5008,30 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -4603,8 +5046,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -4621,9 +5065,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4634,28 +5079,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 5
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -4666,27 +5114,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v4, 5
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4697,28 +5149,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -4733,10 +5188,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -4751,9 +5207,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4764,29 +5221,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35]
; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[34:35] offset:16
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -4797,30 +5257,34 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1]
; GFX11-NEXT: global_load_b128 v[4:7], v4, s[0:1] offset:16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4831,29 +5295,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -4870,8 +5337,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -4891,9 +5359,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4904,18 +5373,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 5
; GFX10-NEXT: v_mov_b32_e32 v5, 6
; GFX10-NEXT: v_mov_b32_e32 v6, 7
; GFX10-NEXT: v_mov_b32_e32 v7, 8
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
@@ -4926,9 +5396,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -4939,14 +5411,16 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6
; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
@@ -4958,9 +5432,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4971,18 +5447,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 6
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 7
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
@@ -4993,9 +5470,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -5010,10 +5489,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v16, 0
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -5030,9 +5510,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5043,11 +5524,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v16, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -5056,18 +5539,19 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[34:35] offset:16
; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[34:35] offset:32
; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[34:35] offset:48
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v16i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v16i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -5078,11 +5562,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v12, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -5091,19 +5578,20 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX11-NEXT: global_load_b128 v[4:7], v12, s[0:1] offset:16
; GFX11-NEXT: global_load_b128 v[8:11], v12, s[0:1] offset:32
; GFX11-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:48
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -5114,11 +5602,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
@@ -5127,18 +5617,19 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -5155,10 +5646,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v28, 0
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -5180,9 +5672,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5193,11 +5686,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -5210,18 +5705,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -5232,11 +5728,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v28, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -5249,19 +5748,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80
; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96
; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -5272,11 +5772,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
@@ -5289,18 +5791,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -5317,11 +5820,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v28, 0
; GFX9-NEXT: global_load_dword v32, v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[34:35]
@@ -5345,9 +5849,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5358,11 +5863,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dword v33, v[0:1], off
@@ -5376,20 +5883,21 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32_i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(8)
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -5400,11 +5908,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v28, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: global_load_b32 v32, v[0:1], off
@@ -5418,20 +5929,21 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80
; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96
; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: scratch_store_b32 off, v32, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -5442,11 +5954,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dword v33, v[0:1], off
@@ -5460,20 +5974,21 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8)
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -5491,10 +6006,11 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v43, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -5512,10 +6028,11 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: s_addk_i32 s32, 0xf800
+; GFX9-NEXT: v_readlane_b32 s33, v43, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5526,21 +6043,22 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v43, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: v_mov_b32_e32 v41, v0
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_mov_b32_e32 v41, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
-; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: v_mov_b32_e32 v42, v1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: global_store_dword v[41:42], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -5549,10 +6067,12 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: s_addk_i32 s32, 0xfc00
+; GFX10-NEXT: v_readlane_b32 s33, v43, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -5563,21 +6083,24 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v43, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4
; GFX11-NEXT: scratch_store_b32 off, v42, s33
-; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
-; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_add_i32 s32, s32, 32
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: global_store_b32 v[41:42], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -5586,10 +6109,12 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: s_addk_i32 s32, 0xffe0
+; GFX11-NEXT: v_readlane_b32 s33, v43, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -5600,21 +6125,22 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s32 offset:12 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
@@ -5623,10 +6149,12 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v43, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8
+; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s32 offset:12
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -5642,10 +6170,11 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -5660,9 +6189,10 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5673,29 +6203,32 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ubyte v0, v2, s[34:35]
; GFX10-NEXT: global_load_dword v1, v2, s[34:35] offset:4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_struct_i8_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_struct_i8_i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -5706,30 +6239,34 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u8 v0, v1, s[0:1]
; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] offset:4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -5740,29 +6277,32 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1]
; GFX10-SCRATCH-NEXT: global_load_dword v1, v2, s[0:1] offset:4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -5779,13 +6319,14 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33
; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33
@@ -5796,10 +6337,11 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: s_addk_i32 s32, 0xf800
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5810,17 +6352,18 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 8
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
+; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12
@@ -5828,10 +6371,12 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: s_addk_i32 s32, 0xfc00
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -5842,17 +6387,19 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b8 off, v0, s33
; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4
; GFX11-NEXT: v_mov_b32_e32 v0, s33
+; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12
@@ -5861,10 +6408,12 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: s_addk_i32 s32, 0xffe0
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -5875,17 +6424,18 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:12 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12
@@ -5893,10 +6443,12 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:12
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -5916,8 +6468,9 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33
@@ -5938,7 +6491,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -5946,6 +6499,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5956,23 +6510,24 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 8
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_addk_i32 s32, 0x400
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33
+; GFX10-NEXT: s_addk_i32 s32, 0x400
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8
@@ -5980,14 +6535,16 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfc00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -5998,10 +6555,12 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:20
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: s_getpc_b64 s[0:1]
@@ -6021,14 +6580,16 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_addk_i32 s32, 0xffe0
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:20
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6039,10 +6600,11 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:16 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:20 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8
@@ -6063,14 +6625,16 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: global_store_dword v[0:1], v1, off
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:20
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -6098,10 +6662,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -6134,9 +6699,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6147,14 +6713,15 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35]
@@ -6185,9 +6752,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -6198,14 +6767,16 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1]
@@ -6233,9 +6804,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6246,14 +6819,15 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
@@ -6284,9 +6858,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -6304,7 +6880,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 32
+; GFX9-NEXT: s_mov_b32 s6, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16
@@ -6383,7 +6959,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-NEXT: v_readlane_b32 s33, v40, 32
+; GFX9-NEXT: s_mov_b32 s33, s6
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -6682,8 +7258,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -6697,9 +7274,10 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6710,25 +7288,28 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -6739,26 +7320,29 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6769,25 +7353,28 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -6802,9 +7389,10 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -6818,9 +7406,10 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6831,26 +7420,29 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -6861,17 +7453,19 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_movk_i32 s4, 0x7b
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -6879,9 +7473,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6892,26 +7488,29 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -6926,9 +7525,10 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -6942,9 +7542,10 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6955,26 +7556,29 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -6985,17 +7589,19 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_movk_i32 s4, 0x7b
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -7003,9 +7609,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7016,26 +7624,29 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -7050,9 +7661,10 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -7066,9 +7678,10 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7079,26 +7692,29 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 42
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 42
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -7109,17 +7725,19 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 42
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 42
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -7127,9 +7745,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7140,26 +7760,29 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -7174,10 +7797,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -7193,9 +7817,10 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7206,18 +7831,19 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -7226,9 +7852,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -7239,18 +7867,20 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_movk_i32 s4, 0x7b
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -7260,9 +7890,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7273,18 +7905,19 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -7293,9 +7926,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -7310,14 +7945,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
@@ -7333,9 +7969,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7346,13 +7983,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b64 s[34:35], 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -7370,9 +8008,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -7383,13 +8023,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -7408,9 +8050,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7421,13 +8065,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -7445,9 +8090,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -7463,12 +8110,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
@@ -7488,9 +8136,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7501,18 +8150,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -7527,9 +8177,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -7540,18 +8192,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -7567,9 +8221,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7580,18 +8236,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -7606,9 +8263,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -7623,8 +8282,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 8
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -7633,6 +8292,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 6
@@ -7652,9 +8312,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 8
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7665,13 +8326,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 8
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b64 s[34:35], 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -7695,9 +8357,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 8
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -7708,13 +8372,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 8
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -7739,9 +8405,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 8
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7752,13 +8420,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -7782,9 +8451,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -7802,8 +8473,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 10
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -7814,6 +8485,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
; GFX9-NEXT: v_writelane_b32 v40, s10, 6
; GFX9-NEXT: v_writelane_b32 v40, s11, 7
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 8
@@ -7837,9 +8509,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 10
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7850,13 +8523,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b64 s[34:35], 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -7886,9 +8560,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -7899,13 +8575,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 10
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -7936,9 +8614,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 10
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7949,13 +8629,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -7985,9 +8666,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -8004,9 +8687,10 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -8020,9 +8704,10 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8033,26 +8718,29 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_movk_i32 s4, 0x4400
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x4400
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -8063,17 +8751,19 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_movk_i32 s4, 0x4400
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x4400
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -8081,9 +8771,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8094,26 +8786,29 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -8128,9 +8823,10 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -8144,9 +8840,10 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8157,26 +8854,29 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 4.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 4.0
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -8187,17 +8887,19 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 4.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 4.0
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -8205,9 +8907,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8218,26 +8922,29 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -8252,10 +8959,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -8271,9 +8979,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8284,18 +8993,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1.0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -8304,9 +9014,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -8317,18 +9029,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -8338,9 +9052,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8351,18 +9067,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -8371,9 +9088,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -8388,11 +9107,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 5
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 3
@@ -8410,9 +9130,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 5
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8423,18 +9144,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 5
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1.0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 4.0
; GFX10-NEXT: v_writelane_b32 v40, s30, 3
@@ -8446,9 +9168,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 5
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -8459,18 +9183,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 5
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 4.0
; GFX11-NEXT: v_writelane_b32 v40, s30, 3
@@ -8483,9 +9209,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 5
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8496,18 +9224,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
@@ -8519,9 +9248,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -8536,13 +9267,14 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 7
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 5
@@ -8564,9 +9296,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 7
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8577,18 +9310,19 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 7
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[34:35]
-; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4
-; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b32 s4, 1.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_mov_b32 s5, 2.0
+; GFX10-NEXT: s_getpc_b64 s[34:35]
+; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4
+; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 4.0
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -8606,9 +9340,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 7
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -8619,18 +9355,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 7
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 4.0
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -8649,9 +9387,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 7
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8662,18 +9402,19 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 7
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -8691,9 +9432,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 7
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -8708,10 +9451,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -8727,9 +9471,10 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8740,18 +9485,19 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -8760,9 +9506,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -8773,18 +9521,20 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 0x40100000
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0x40100000
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -8794,9 +9544,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8807,18 +9559,19 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -8827,9 +9580,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -8844,12 +9599,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
@@ -8869,9 +9625,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8882,18 +9639,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -8908,9 +9666,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -8921,18 +9681,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 0
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -8948,9 +9710,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8961,18 +9725,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -8987,9 +9752,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -9004,14 +9771,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 8
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 6
@@ -9035,9 +9803,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 8
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9048,18 +9817,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 8
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f64_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -9080,9 +9850,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 8
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -9093,18 +9865,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 8
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 0
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -9126,9 +9900,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 8
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9139,18 +9915,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -9171,9 +9948,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -9188,10 +9967,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -9204,9 +9984,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9217,26 +9998,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -9247,17 +10031,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
-; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9265,9 +10051,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9278,26 +10066,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -9313,11 +10104,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -9331,9 +10123,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9344,12 +10137,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_getpc_b64 s[34:35]
@@ -9363,9 +10157,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -9376,12 +10172,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_getpc_b64 s[0:1]
@@ -9396,9 +10194,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9409,12 +10209,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
@@ -9428,9 +10229,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -9446,11 +10249,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -9464,9 +10268,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9477,12 +10282,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_getpc_b64 s[34:35]
@@ -9496,9 +10302,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -9509,12 +10317,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_getpc_b64 s[0:1]
@@ -9529,9 +10339,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9542,12 +10354,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
@@ -9561,9 +10374,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -9579,10 +10394,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -9598,9 +10414,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9611,18 +10428,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 3
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 3
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -9631,9 +10449,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -9644,18 +10464,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0x20001
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 3
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0x20001
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 3
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -9665,9 +10487,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9678,18 +10502,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -9698,9 +10523,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -9715,10 +10542,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -9734,9 +10562,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9747,18 +10576,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0x40003c00
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_movk_i32 s5, 0x4400
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x40003c00
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_movk_i32 s5, 0x4400
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -9767,9 +10597,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -9780,18 +10612,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0x40003c00
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_movk_i32 s5, 0x4400
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0x40003c00
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_movk_i32 s5, 0x4400
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -9801,9 +10635,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9814,18 +10650,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -9834,9 +10671,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -9851,11 +10690,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -9869,9 +10709,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9882,12 +10723,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_getpc_b64 s[34:35]
@@ -9901,9 +10743,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -9914,12 +10758,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_getpc_b64 s[0:1]
@@ -9934,9 +10780,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9947,12 +10795,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
@@ -9966,9 +10815,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -9984,10 +10835,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -10003,9 +10855,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10016,18 +10869,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 0x40003
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0x40003
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -10036,9 +10890,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -10049,18 +10905,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0x20001
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 0x40003
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0x20001
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0x40003
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -10070,9 +10928,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10083,18 +10943,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -10103,9 +10964,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -10120,10 +10983,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -10136,9 +11000,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10149,26 +11014,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -10179,17 +11047,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
-; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10197,9 +11067,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10210,26 +11082,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -10245,11 +11120,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -10263,9 +11139,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10276,12 +11153,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_getpc_b64 s[34:35]
@@ -10295,9 +11173,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -10308,12 +11188,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_getpc_b64 s[0:1]
@@ -10328,9 +11210,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10341,12 +11225,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
@@ -10360,9 +11245,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -10378,10 +11265,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -10397,9 +11285,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10410,18 +11299,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -10430,9 +11320,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -10443,18 +11335,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -10464,9 +11358,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10477,18 +11373,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -10497,9 +11394,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -10514,11 +11413,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 5
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 3
@@ -10536,9 +11436,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 5
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10549,18 +11450,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 5
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 3
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 4
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 3
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 4
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 5
; GFX10-NEXT: v_writelane_b32 v40, s30, 3
@@ -10572,9 +11474,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 5
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -10585,18 +11489,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 5
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 3
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 4
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 3
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 4
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 5
; GFX11-NEXT: v_writelane_b32 v40, s30, 3
@@ -10609,9 +11515,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 5
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10622,18 +11530,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
@@ -10645,9 +11554,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -10662,12 +11573,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
@@ -10687,9 +11599,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10700,18 +11613,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 3
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 4
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 3
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 4
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 5
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -10726,9 +11640,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -10739,18 +11655,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 3
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 4
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 3
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 4
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 5
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -10766,9 +11684,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10779,18 +11699,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -10805,9 +11726,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -10822,13 +11745,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
@@ -10844,9 +11768,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10857,12 +11782,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -10880,9 +11806,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -10893,12 +11821,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -10917,9 +11847,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10930,12 +11862,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -10953,9 +11886,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -10971,12 +11906,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
@@ -10996,9 +11932,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11009,18 +11946,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -11035,9 +11973,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -11048,18 +11988,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -11075,9 +12017,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11088,18 +12032,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -11114,9 +12059,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -11131,13 +12078,14 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 7
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 5
@@ -11159,9 +12107,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 7
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11172,18 +12121,19 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 7
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -11201,9 +12151,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 7
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -11214,18 +12166,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 7
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -11244,9 +12198,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 7
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11257,18 +12213,19 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 7
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -11286,9 +12243,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 7
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -11303,8 +12262,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 10
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -11316,6 +12275,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s11, 7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 8
@@ -11335,9 +12295,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 10
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11348,13 +12309,14 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -11381,9 +12343,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -11394,13 +12358,15 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 10
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -11428,9 +12394,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 10
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11441,13 +12409,14 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -11474,9 +12443,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -11493,8 +12464,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 10
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -11503,6 +12474,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
; GFX9-NEXT: v_writelane_b32 v40, s10, 6
; GFX9-NEXT: v_writelane_b32 v40, s11, 7
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 8
@@ -11530,9 +12502,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 10
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11543,18 +12516,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -11581,9 +12555,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -11594,18 +12570,20 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 10
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -11633,9 +12611,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 10
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11646,18 +12626,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -11684,9 +12665,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -11701,8 +12684,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 18
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -11722,6 +12705,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s19, 15
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 16
@@ -11749,9 +12733,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 18
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11762,13 +12747,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 18
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -11811,9 +12797,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 18
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -11824,13 +12812,15 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 18
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -11874,9 +12864,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 18
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11887,13 +12879,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 18
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -11936,9 +12929,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 18
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -11955,8 +12950,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 28
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -11985,6 +12980,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s25, 21
; GFX9-NEXT: v_writelane_b32 v40, s26, 22
; GFX9-NEXT: v_writelane_b32 v40, s27, 23
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s28, 24
@@ -12047,9 +13043,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 28
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -12060,13 +13057,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 28
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -12154,9 +13152,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 28
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -12167,13 +13167,15 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 28
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -12256,9 +13258,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 28
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -12269,13 +13273,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 28
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -12359,9 +13364,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 28
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -12378,8 +13385,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 28
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
@@ -12409,6 +13416,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s24, 20
; GFX9-NEXT: v_writelane_b32 v40, s25, 21
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s26, 22
@@ -12475,9 +13483,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 28
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -12488,13 +13497,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 28
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
@@ -12587,9 +13597,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 28
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -12600,13 +13612,15 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 28
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
@@ -12692,9 +13706,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 28
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -12705,13 +13721,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 28
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
@@ -12800,9 +13817,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 28
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -12820,12 +13839,13 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4
-; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[34:35]
@@ -12838,10 +13858,11 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: s_addk_i32 s32, 0xf800
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -12852,15 +13873,16 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, stack_passed_f64_arg@rel32@hi+12
@@ -12872,10 +13894,12 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: s_addk_i32 s32, 0xfc00
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -12886,26 +13910,30 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33
+; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: s_addk_i32 s32, 0xffe0
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -12916,26 +13944,29 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:12 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:12
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -12951,8 +13982,9 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_mov_b32_e32 v0, 12
@@ -13004,9 +14036,10 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -13017,12 +14050,13 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: v_mov_b32_e32 v0, 12
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_mov_b32_e32 v2, 14
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_mov_b32_e32 v3, 15
@@ -13071,9 +14105,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -13084,18 +14120,20 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13
; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1
+; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1
; GFX11-NEXT: v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, 2
; GFX11-NEXT: v_dual_mov_b32 v8, 2 :: v_dual_mov_b32 v9, 3
; GFX11-NEXT: v_dual_mov_b32 v10, 3 :: v_dual_mov_b32 v11, 3
@@ -13118,9 +14156,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -13131,22 +14171,23 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 1
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 2
@@ -13182,9 +14223,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -13212,8 +14255,9 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_mov_b32_e32 v0, 8
@@ -13273,9 +14317,10 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -13286,12 +14331,13 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 8
; GFX10-NEXT: v_mov_b32_e32 v1, 9
; GFX10-NEXT: v_mov_b32_e32 v2, 10
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
@@ -13348,9 +14394,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -13361,13 +14409,15 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13
; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15
; GFX11-NEXT: v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9
; GFX11-NEXT: v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
@@ -13399,9 +14449,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -13412,9 +14464,9 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14
@@ -13423,6 +14475,7 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 9
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
@@ -13468,9 +14521,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -13494,8 +14549,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000
@@ -13555,9 +14611,10 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -13568,12 +14625,13 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000
; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
@@ -13630,9 +14688,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -13643,9 +14703,10 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: v_mov_b32_e32 v0, 0x41400000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x41500000
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41600000
@@ -13654,6 +14715,7 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX11-NEXT: v_mov_b32_e32 v5, 0x41100000
; GFX11-NEXT: v_mov_b32_e32 v6, 0x41200000
; GFX11-NEXT: v_mov_b32_e32 v7, 0x41300000
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
@@ -13687,9 +14749,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -13700,9 +14764,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41400000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41500000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41600000
@@ -13711,6 +14775,7 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x41100000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
@@ -13756,9 +14821,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 6455b8940f38..b21ea4ec8182 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -11,10 +11,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
@@ -31,9 +32,10 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -44,12 +46,13 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
@@ -65,9 +68,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -78,12 +83,14 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: s_getpc_b64 s[4:5]
; GFX11-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
@@ -100,9 +107,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -213,9 +222,10 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -236,9 +246,10 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -249,15 +260,16 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: ;;#ASMSTART
@@ -273,9 +285,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -286,15 +300,17 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: ;;#ASMSTART
@@ -310,9 +326,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -328,8 +346,9 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v42, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -351,9 +370,10 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v42, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -364,21 +384,22 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v42, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def v31
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v41, v31
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_mov_b32_e32 v31, v41
; GFX10-NEXT: ;;#ASMSTART
@@ -388,9 +409,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v42, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -401,21 +424,24 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v42, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def v31
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v41, v31
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_mov_b32_e32 v31, v41
; GFX11-NEXT: ;;#ASMSTART
@@ -425,9 +451,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v42, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -444,9 +472,10 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -467,9 +496,10 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -480,20 +510,21 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def s33
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b32 s4, s33
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_mov_b32 s33, s4
@@ -504,9 +535,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -517,20 +550,22 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s33
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b32 s4, s33
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_mov_b32 s33, s4
@@ -542,9 +577,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -560,9 +597,10 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -583,9 +621,10 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -596,20 +635,21 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[36:37]
; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_void@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s37, s37, external_void_func_void@rel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def s34
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b32 s4, s34
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX10-NEXT: s_mov_b32 s34, s4
@@ -620,9 +660,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -633,20 +675,22 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s34
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b32 s4, s34
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_mov_b32 s34, s4
@@ -658,9 +702,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -676,8 +722,9 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v41, s33, 2
+; GFX9-NEXT: v_writelane_b32 v42, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v41, s30, 0
@@ -697,9 +744,10 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
; GFX9-NEXT: v_readlane_b32 s31, v41, 1
; GFX9-NEXT: v_readlane_b32 s30, v41, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v41, 2
+; GFX9-NEXT: v_readlane_b32 s33, v42, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -710,20 +758,21 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v41, s33, 2
+; GFX10-NEXT: v_writelane_b32 v41, s30, 0
+; GFX10-NEXT: v_writelane_b32 v42, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v41, s31, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def v40
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_writelane_b32 v41, s30, 0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v41, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use v40
@@ -732,9 +781,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
; GFX10-NEXT: v_readlane_b32 s31, v41, 1
; GFX10-NEXT: v_readlane_b32 s30, v41, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v41, 2
+; GFX10-NEXT: v_readlane_b32 s33, v42, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -745,20 +796,23 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v41, s33, 2
+; GFX11-NEXT: v_writelane_b32 v41, s30, 0
+; GFX11-NEXT: v_writelane_b32 v42, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: v_writelane_b32 v41, s31, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def v40
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v41, s30, 0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v41, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use v40
@@ -767,9 +821,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
; GFX11-NEXT: v_readlane_b32 s31, v41, 1
; GFX11-NEXT: v_readlane_b32 s30, v41, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v41, 2
+; GFX11-NEXT: v_readlane_b32 s33, v42, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -909,8 +965,9 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -922,9 +979,10 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -935,23 +993,26 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, void_func_void_clobber_s33@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -962,24 +1023,28 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s33@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s33@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -993,8 +1058,9 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
@@ -1006,9 +1072,10 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1019,23 +1086,26 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, void_func_void_clobber_s34@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1046,24 +1116,28 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s34@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s34@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1077,9 +1151,10 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -1099,9 +1174,10 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1112,20 +1188,21 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def s40
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b32 s4, s40
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: ;;#ASMSTART
@@ -1135,9 +1212,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1148,20 +1227,22 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s40
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b32 s4, s40
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: ;;#ASMSTART
@@ -1172,9 +1253,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1190,9 +1273,10 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v42, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
@@ -1221,9 +1305,10 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v42, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1234,26 +1319,27 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v42, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def s40
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b32 s4, s40
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def v32
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_mov_b32_e32 v41, v32
+; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s4
@@ -1266,9 +1352,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v42, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1279,26 +1367,29 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v42, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s40
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b32 s4, s40
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def v32
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: v_mov_b32_e32 v41, v32
+; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s4
@@ -1311,9 +1402,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v42, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 54fb7ed36e8c..c8cdb7b9a787 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -27,7 +27,7 @@ define amdgpu_gfx void @call_i1() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v1, s33, 2
+; GFX9-NEXT: s_mov_b32 s36, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[34:35]
@@ -41,7 +41,7 @@ define amdgpu_gfx void @call_i1() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v1, 1
; GFX9-NEXT: v_readlane_b32 s30, v1, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v1, 2
+; GFX9-NEXT: s_mov_b32 s33, s36
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
@@ -56,21 +56,21 @@ define amdgpu_gfx void @call_i1() #0 {
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v1, s33, 2
+; GFX10-NEXT: s_mov_b32 s36, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12
-; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_writelane_b32 v1, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_writelane_b32 v1, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v1, 1
; GFX10-NEXT: v_readlane_b32 s30, v1, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v1, 2
+; GFX10-NEXT: s_mov_b32 s33, s36
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -85,14 +85,14 @@ define amdgpu_gfx void @call_i1() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v1, s33, 2
+; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, return_i1@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, return_i1@gotpcrel32@hi+12
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v1, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v1, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -100,7 +100,7 @@ define amdgpu_gfx void @call_i1() #0 {
; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: v_readlane_b32 s30, v1, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v1, 2
+; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
@@ -135,7 +135,7 @@ define amdgpu_gfx void @call_i16() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v1, s33, 2
+; GFX9-NEXT: s_mov_b32 s36, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[34:35]
@@ -149,7 +149,7 @@ define amdgpu_gfx void @call_i16() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v1, 1
; GFX9-NEXT: v_readlane_b32 s30, v1, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v1, 2
+; GFX9-NEXT: s_mov_b32 s33, s36
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
@@ -164,21 +164,21 @@ define amdgpu_gfx void @call_i16() #0 {
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v1, s33, 2
+; GFX10-NEXT: s_mov_b32 s36, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12
-; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_writelane_b32 v1, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_writelane_b32 v1, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v1, 1
; GFX10-NEXT: v_readlane_b32 s30, v1, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v1, 2
+; GFX10-NEXT: s_mov_b32 s33, s36
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -193,14 +193,14 @@ define amdgpu_gfx void @call_i16() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v1, s33, 2
+; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, return_i16@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, return_i16@gotpcrel32@hi+12
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v1, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v1, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -208,7 +208,7 @@ define amdgpu_gfx void @call_i16() #0 {
; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: v_readlane_b32 s30, v1, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v1, 2
+; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
@@ -243,7 +243,7 @@ define amdgpu_gfx void @call_2xi16() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v1, s33, 2
+; GFX9-NEXT: s_mov_b32 s36, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[34:35]
@@ -257,7 +257,7 @@ define amdgpu_gfx void @call_2xi16() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v1, 1
; GFX9-NEXT: v_readlane_b32 s30, v1, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v1, 2
+; GFX9-NEXT: s_mov_b32 s33, s36
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
@@ -272,21 +272,21 @@ define amdgpu_gfx void @call_2xi16() #0 {
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v1, s33, 2
+; GFX10-NEXT: s_mov_b32 s36, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12
-; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_writelane_b32 v1, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_writelane_b32 v1, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v1, 1
; GFX10-NEXT: v_readlane_b32 s30, v1, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v1, 2
+; GFX10-NEXT: s_mov_b32 s33, s36
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -301,14 +301,14 @@ define amdgpu_gfx void @call_2xi16() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v1, s33, 2
+; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, return_2xi16@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, return_2xi16@gotpcrel32@hi+12
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v1, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v1, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -316,7 +316,7 @@ define amdgpu_gfx void @call_2xi16() #0 {
; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: v_readlane_b32 s30, v1, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v1, 2
+; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
@@ -360,7 +360,7 @@ define amdgpu_gfx void @call_3xi16() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v2, s33, 2
+; GFX9-NEXT: s_mov_b32 s36, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[34:35]
@@ -374,7 +374,7 @@ define amdgpu_gfx void @call_3xi16() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v2, 2
+; GFX9-NEXT: s_mov_b32 s33, s36
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
@@ -389,21 +389,21 @@ define amdgpu_gfx void @call_3xi16() #0 {
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v2, s33, 2
+; GFX10-NEXT: s_mov_b32 s36, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12
-; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_writelane_b32 v2, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v2, 2
+; GFX10-NEXT: s_mov_b32 s33, s36
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -418,14 +418,14 @@ define amdgpu_gfx void @call_3xi16() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v2, s33, 2
+; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, return_3xi16@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, return_3xi16@gotpcrel32@hi+12
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -433,7 +433,7 @@ define amdgpu_gfx void @call_3xi16() #0 {
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v2, 2
+; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
@@ -1641,7 +1641,7 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v2, s33, 2
+; GFX9-NEXT: s_mov_b32 s36, s33
; GFX9-NEXT: s_add_i32 s33, s32, 0x1ffc0
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffe0000
; GFX9-NEXT: s_add_i32 s32, s32, 0x60000
@@ -1657,7 +1657,7 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: s_add_i32 s32, s32, 0xfffa0000
-; GFX9-NEXT: v_readlane_b32 s33, v2, 2
+; GFX9-NEXT: s_mov_b32 s33, s36
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
@@ -1672,7 +1672,7 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v2, s33, 2
+; GFX10-NEXT: s_mov_b32 s36, s33
; GFX10-NEXT: s_add_i32 s33, s32, 0xffe0
; GFX10-NEXT: s_add_i32 s32, s32, 0x30000
; GFX10-NEXT: s_and_b32 s33, s33, 0xffff0000
@@ -1688,7 +1688,7 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: s_add_i32 s32, s32, 0xfffd0000
-; GFX10-NEXT: v_readlane_b32 s33, v2, 2
+; GFX10-NEXT: s_mov_b32 s33, s36
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -1703,7 +1703,7 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v5, s32 offset:2048 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v5, s33, 2
+; GFX11-NEXT: s_mov_b32 s34, s33
; GFX11-NEXT: s_add_i32 s33, s32, 0x7ff
; GFX11-NEXT: s_addk_i32 s32, 0x1800
; GFX11-NEXT: s_and_b32 s33, s33, 0xfffff800
@@ -1720,7 +1720,7 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX11-NEXT: v_readlane_b32 s31, v5, 1
; GFX11-NEXT: v_readlane_b32 s30, v5, 0
; GFX11-NEXT: s_addk_i32 s32, 0xe800
-; GFX11-NEXT: v_readlane_b32 s33, v5, 2
+; GFX11-NEXT: s_mov_b32 s33, s34
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:2048 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index ae55155bbfc3..c7e2047d487c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -395,8 +395,9 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 18
+; GCN-NEXT: v_writelane_b32 v41, s33, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
@@ -465,9 +466,10 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 18
+; GCN-NEXT: v_readlane_b32 s33, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -477,8 +479,9 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[16:17]
-; GISEL-NEXT: v_writelane_b32 v40, s33, 18
+; GISEL-NEXT: v_writelane_b32 v41, s33, 0
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
@@ -547,9 +550,10 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
-; GISEL-NEXT: v_readlane_b32 s33, v40, 18
+; GISEL-NEXT: v_readlane_b32 s33, v41, 0
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -563,8 +567,9 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 18
+; GCN-NEXT: v_writelane_b32 v41, s33, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
@@ -636,9 +641,10 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 18
+; GCN-NEXT: v_readlane_b32 s33, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -648,8 +654,9 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[16:17]
-; GISEL-NEXT: v_writelane_b32 v40, s33, 18
+; GISEL-NEXT: v_writelane_b32 v41, s33, 0
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
@@ -719,9 +726,10 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
-; GISEL-NEXT: v_readlane_b32 s33, v40, 18
+; GISEL-NEXT: v_readlane_b32 s33, v41, 0
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -735,8 +743,9 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 18
+; GCN-NEXT: v_writelane_b32 v41, s33, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
@@ -807,9 +816,10 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 18
+; GCN-NEXT: v_readlane_b32 s33, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -819,8 +829,9 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[16:17]
-; GISEL-NEXT: v_writelane_b32 v40, s33, 18
+; GISEL-NEXT: v_writelane_b32 v41, s33, 0
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
@@ -891,9 +902,10 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
-; GISEL-NEXT: v_readlane_b32 s33, v40, 18
+; GISEL-NEXT: v_readlane_b32 s33, v41, 0
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -908,8 +920,9 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 20
+; GCN-NEXT: v_writelane_b32 v41, s33, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
@@ -989,9 +1002,10 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 20
+; GCN-NEXT: v_readlane_b32 s33, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1001,8 +1015,9 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_or_saveexec_b64 s[16:17], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[16:17]
-; GISEL-NEXT: v_writelane_b32 v40, s33, 20
+; GISEL-NEXT: v_writelane_b32 v41, s33, 0
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
@@ -1082,9 +1097,10 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
-; GISEL-NEXT: v_readlane_b32 s33, v40, 20
+; GISEL-NEXT: v_readlane_b32 s33, v41, 0
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1106,7 +1122,7 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_writelane_b32 v40, s33, 32
+; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
@@ -1187,7 +1203,7 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 32
+; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
@@ -1200,7 +1216,7 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
-; GISEL-NEXT: v_writelane_b32 v40, s33, 32
+; GISEL-NEXT: s_mov_b32 s5, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
@@ -1281,7 +1297,7 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
-; GISEL-NEXT: v_readlane_b32 s33, v40, 32
+; GISEL-NEXT: s_mov_b32 s33, s5
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
@@ -1298,7 +1314,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_writelane_b32 v40, s33, 32
+; GCN-NEXT: s_mov_b32 s10, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1383,7 +1399,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 32
+; GCN-NEXT: s_mov_b32 s33, s10
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
@@ -1396,7 +1412,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
-; GISEL-NEXT: v_writelane_b32 v40, s33, 32
+; GISEL-NEXT: s_mov_b32 s10, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1481,7 +1497,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
-; GISEL-NEXT: v_readlane_b32 s33, v40, 32
+; GISEL-NEXT: s_mov_b32 s33, s10
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
@@ -1502,7 +1518,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_writelane_b32 v40, s33, 32
+; GCN-NEXT: s_mov_b32 s10, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
@@ -1585,7 +1601,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 32
+; GCN-NEXT: s_mov_b32 s33, s10
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
@@ -1598,7 +1614,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
-; GISEL-NEXT: v_writelane_b32 v40, s33, 32
+; GISEL-NEXT: s_mov_b32 s10, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
@@ -1681,7 +1697,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
-; GISEL-NEXT: v_readlane_b32 s33, v40, 32
+; GISEL-NEXT: s_mov_b32 s33, s10
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
@@ -1699,7 +1715,7 @@ define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_writelane_b32 v40, s33, 32
+; GCN-NEXT: s_mov_b32 s10, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
@@ -1779,7 +1795,7 @@ define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 32
+; GCN-NEXT: s_mov_b32 s33, s10
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
@@ -1792,7 +1808,7 @@ define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
-; GISEL-NEXT: v_writelane_b32 v40, s33, 32
+; GISEL-NEXT: s_mov_b32 s10, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
@@ -1872,7 +1888,7 @@ define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
-; GISEL-NEXT: v_readlane_b32 s33, v40, 32
+; GISEL-NEXT: s_mov_b32 s33, s10
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index ac9540bc0d84..3066daa47eac 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -190,10 +190,11 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 5
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: v_writelane_b32 v44, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
@@ -229,9 +230,10 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-NEXT: v_readlane_b32 s33, v40, 5
+; GFX9-NEXT: v_readlane_b32 s33, v44, 0
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index 109163bf7373..a5ef9dc00150 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -30,7 +30,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-NEXT: v_writelane_b32 v1, s33, 2
+; CHECK-NEXT: s_mov_b32 s6, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_add_i32 s32, s32, 0x400
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -51,7 +51,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-NEXT: v_readlane_b32 s30, v1, 0
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
-; CHECK-NEXT: v_readlane_b32 s33, v1, 2
+; CHECK-NEXT: s_mov_b32 s33, s6
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
@@ -155,7 +155,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-NEXT: v_writelane_b32 v1, s33, 2
+; CHECK-NEXT: s_mov_b32 s6, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_add_i32 s32, s32, 0x400
; CHECK-NEXT: v_writelane_b32 v1, s30, 0
@@ -171,7 +171,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: v_readlane_b32 s30, v1, 0
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
-; CHECK-NEXT: v_readlane_b32 s33, v1, 2
+; CHECK-NEXT: s_mov_b32 s33, s6
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
@@ -189,7 +189,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-NEXT: v_writelane_b32 v2, s33, 2
+; CHECK-NEXT: s_mov_b32 s7, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_add_i32 s32, s32, 0x400
; CHECK-NEXT: v_writelane_b32 v2, s30, 0
@@ -205,7 +205,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-NEXT: v_readlane_b32 s31, v2, 1
; CHECK-NEXT: v_readlane_b32 s30, v2, 0
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
-; CHECK-NEXT: v_readlane_b32 s33, v2, 2
+; CHECK-NEXT: s_mov_b32 s33, s7
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index 88e3949370e2..50948bd2134f 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -13,8 +13,9 @@ declare void @external_void_func_i32(i32) #0
; Spill CSR VGPR used for SGPR spilling
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-DAG: v_writelane_b32 v40, s33, 2
+; GCN-DAG: v_writelane_b32 v41, s33, 0
; GCN-DAG: s_mov_b32 s33, s32
; GCN-DAG: s_addk_i32 s32, 0x400
; GCN-DAG: v_writelane_b32 v40, s30, 0
@@ -26,9 +27,10 @@ declare void @external_void_func_i32(i32) #0
; GCN: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 2
+; GCN-NEXT: v_readlane_b32 s33, v41, 0
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index e28ca75e2947..e69f26ec2d89 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -15,8 +15,9 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
-; CHECK-NEXT: v_writelane_b32 v40, s33, 2
+; CHECK-NEXT: v_writelane_b32 v41, s33, 0
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_add_i32 s32, s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
@@ -39,9 +40,10 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
-; CHECK-NEXT: v_readlane_b32 s33, v40, 2
+; CHECK-NEXT: v_readlane_b32 s33, v41, 0
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
index ae337a6d78be..8dec61e52054 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
@@ -24,7 +24,7 @@ body: |
liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255
; GFX8-LABEL: name: pei_scavenge_vgpr_spill
- ; GFX8: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr2
+ ; GFX8: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc
@@ -49,7 +49,7 @@ body: |
; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
; GFX8-NEXT: S_ENDPGM 0, amdgpu_allvgprs
; GFX9-LABEL: name: pei_scavenge_vgpr_spill
- ; GFX9: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr2
+ ; GFX9: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def dead $scc
@@ -72,7 +72,7 @@ body: |
; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
; GFX9-NEXT: S_ENDPGM 0, amdgpu_allvgprs
; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill
- ; GFX9-FLATSCR: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr2
+ ; GFX9-FLATSCR: liveins: $vgpr2, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def dead $scc
diff --git a/llvm/test/CodeGen/AMDGPU/save-fp.ll b/llvm/test/CodeGen/AMDGPU/save-fp.ll
index ec56f41aa1a0..ccc7e57cb56b 100644
--- a/llvm/test/CodeGen/AMDGPU/save-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/save-fp.ll
@@ -11,14 +11,14 @@ bb:
; GCN-LABEL: {{^}}caller:
-; GCN: v_writelane_b32 v2, s33, 2
+; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33
; GCN: s_mov_b32 s33, s32
; GFX900: buffer_store_dword
; GFX908-DAG: v_accvgpr_write_b32
; GCN: s_swappc_b64
; GFX900: buffer_load_dword
; GFX908: v_accvgpr_read_b32
-; GCN: v_readlane_b32 s33, v2, 2
+; GCN: s_mov_b32 s33, [[TMP_SGPR]]
define i64 @caller() {
bb:
call void asm sideeffect "", "~{v40}" ()
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index f34531b76feb..b591fd9f6c2f 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -19,7 +19,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_writelane_b32 v255, s33, 2
+; GCN-NEXT: s_mov_b32 s6, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_i32 s32, s32, 0x7400
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill
@@ -264,7 +264,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload
; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00
-; GCN-NEXT: v_readlane_b32 s33, v255, 2
+; GCN-NEXT: s_mov_b32 s33, s6
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
@@ -311,7 +311,7 @@ define void @spill_to_lowest_available_vgpr() #0 {
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_writelane_b32 v254, s33, 2
+; GCN-NEXT: s_mov_b32 s6, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_i32 s32, s32, 0x7400
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill
@@ -554,7 +554,7 @@ define void @spill_to_lowest_available_vgpr() #0 {
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload
; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00
-; GCN-NEXT: v_readlane_b32 s33, v254, 2
+; GCN-NEXT: s_mov_b32 s33, s6
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index dd0c0d54e5bd..da4764f16e55 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -200,9 +200,10 @@ entry:
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword [[CSRV_1:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
-; GCN: v_writelane_b32 [[CSRV]], s33, 2
-; GCN-DAG: s_addk_i32 s32, 0x400
+; GCN: v_writelane_b32 [[CSRV_1]], s33, 0
+; GCN-DAG: s_addk_i32 s32, 0x800
; GCN-DAG: s_getpc_b64 s[4:5]
; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
@@ -226,10 +227,11 @@ entry:
; GCN-DAG: v_readlane_b32 s30, [[CSRV]], 0
; GCN-DAG: v_readlane_b32 s31, [[CSRV]], 1
-; GCN: s_addk_i32 s32, 0xfc00
+; GCN: s_addk_i32 s32, 0xf800
; GCN-NEXT: v_readlane_b32 s33,
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword [[CSRV_1]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
index 0b5109b7270b..e7249344c870 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
@@ -3,16 +3,18 @@
; GCN-LABEL: {{^}}spill_csr_s5_copy:
; GCN: s_or_saveexec_b64
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
-; GCN: v_writelane_b32 v40, s33, 3
+; GCN: v_writelane_b32 v41, s33, 0
; GCN: s_swappc_b64
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9
; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
-; GCN: v_readlane_b32 s33, v40, 3
+; GCN: v_readlane_b32 s33, v41, 0
; GCN: s_or_saveexec_b64
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN: s_mov_b64 exec
; GCN: s_setpc_b64
define void @spill_csr_s5_copy() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index b2c4444d7f84..c761a39f2d2f 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -157,12 +157,13 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill:
; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword [[VGPR_REG_1:v[0-9]+]], off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
+; GCN-NEXT: v_writelane_b32 [[VGPR_REG_1]], s33, 0
; GCN-DAG: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
; GCN: v_mov_b32_e32 v32, 0
-; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3
+; GCN-DAG: v_writelane_b32 [[VGPR_REG_1]], s34, 1
; GCN: s_mov_b32 s34, s32
; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -174,10 +175,11 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1
; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0
; GCN: s_add_i32 s32, s32, 0xfffd0000
-; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2
-; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
+; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG_1]], 0
+; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG_1]], 1
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword [[VGPR_REG_1]], off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN: s_setpc_b64 s[30:31]
%temp = alloca i32, align 1024, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
index fba11bf0124e..d46a597d8500 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
@@ -20,8 +20,8 @@ define amdgpu_gfx float @caller(float %arg0) {
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
-; GCN-NEXT: v_writelane_b32 v1, s33, 3
; GCN-NEXT: v_writelane_b32 v1, s4, 0
+; GCN-NEXT: s_mov_b32 s36, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v1, s30, 1
@@ -36,7 +36,7 @@ define amdgpu_gfx float @caller(float %arg0) {
; GCN-NEXT: v_readlane_b32 s30, v1, 1
; GCN-NEXT: v_readlane_b32 s4, v1, 0
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v1, 3
+; GCN-NEXT: s_mov_b32 s33, s36
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index f945a797b774..d3a75eda7f2c 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -10,8 +10,9 @@ define internal fastcc void @widget() {
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_or_saveexec_b64 s[16:17], -1
; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[16:17]
-; GFX90A-NEXT: v_writelane_b32 v40, s33, 2
+; GFX90A-NEXT: v_writelane_b32 v41, s33, 0
; GFX90A-NEXT: s_mov_b32 s33, s32
; GFX90A-NEXT: s_addk_i32 s32, 0x400
; GFX90A-NEXT: s_getpc_b64 s[16:17]
@@ -33,12 +34,12 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS1-NEXT: s_mov_b64 s[54:55], s[6:7]
; GLOBALNESS1-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0
; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, v0
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v44, 0
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v0
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v40, 0
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT: global_store_dword v[0:1], v44, off
+; GLOBALNESS1-NEXT: global_store_dword v[0:1], v40, off
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT: global_load_dword v0, v44, s[36:37]
+; GLOBALNESS1-NEXT: global_load_dword v0, v40, s[36:37]
; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GLOBALNESS1-NEXT: s_mov_b64 s[64:65], s[4:5]
; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18
@@ -46,10 +47,10 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17
; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, 0x40994400
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, 0x40994400
; GLOBALNESS1-NEXT: s_bitcmp1_b32 s38, 0
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[44:45]
+; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41]
; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0
; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0
; GLOBALNESS1-NEXT: s_xor_b64 s[94:95], s[4:5], -1
@@ -76,20 +77,20 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS1-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
-; GLOBALNESS1-NEXT: v_writelane_b32 v41, s4, 0
-; GLOBALNESS1-NEXT: v_writelane_b32 v41, s5, 1
+; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 0
+; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 1
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GLOBALNESS1-NEXT: v_writelane_b32 v41, s4, 2
-; GLOBALNESS1-NEXT: v_writelane_b32 v41, s5, 3
+; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 2
+; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 3
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
-; GLOBALNESS1-NEXT: v_writelane_b32 v41, s4, 4
+; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 4
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0
-; GLOBALNESS1-NEXT: v_writelane_b32 v41, s5, 5
+; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 5
; GLOBALNESS1-NEXT: s_branch .LBB1_4
; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v41, 4
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v41, 5
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 4
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 5
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29
; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow6
@@ -137,10 +138,10 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1
; GLOBALNESS1-NEXT: ; Child Loop BB1_15 Depth 2
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1]
-; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1]
+; GLOBALNESS1-NEXT: flat_load_dword v44, v[0:1]
; GLOBALNESS1-NEXT: s_add_u32 s8, s62, 40
-; GLOBALNESS1-NEXT: buffer_store_dword v44, off, s[0:3], 0
-; GLOBALNESS1-NEXT: flat_load_dword v43, v[0:1]
+; GLOBALNESS1-NEXT: buffer_store_dword v40, off, s[0:3], 0
+; GLOBALNESS1-NEXT: flat_load_dword v45, v[0:1]
; GLOBALNESS1-NEXT: s_addc_u32 s9, s63, 0
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55]
@@ -148,7 +149,7 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS1-NEXT: s_mov_b32 s12, s100
; GLOBALNESS1-NEXT: s_mov_b32 s13, s99
; GLOBALNESS1-NEXT: s_mov_b32 s14, s98
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67]
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43]
@@ -218,19 +219,19 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS1-NEXT: ; %bb.10: ; %bb33.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[32:33], off
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v41, 0
-; GLOBALNESS1-NEXT: v_readlane_b32 s5, v41, 1
+; GLOBALNESS1-NEXT: v_readlane_b32 s4, v42, 0
+; GLOBALNESS1-NEXT: v_readlane_b32 s5, v42, 1
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_12
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb39.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[44:45], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off
; GLOBALNESS1-NEXT: .LBB1_12: ; %bb44.lr.ph.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43
-; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45
+; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc
; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], s[42:43]
; GLOBALNESS1-NEXT: s_mov_b32 s75, s39
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
@@ -279,9 +280,9 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS1-NEXT: s_mov_b32 s12, s100
; GLOBALNESS1-NEXT: s_mov_b32 s13, s99
; GLOBALNESS1-NEXT: s_mov_b32 s14, s98
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67]
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0
+; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55]
; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61]
@@ -289,15 +290,15 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS1-NEXT: s_mov_b32 s12, s100
; GLOBALNESS1-NEXT: s_mov_b32 s13, s99
; GLOBALNESS1-NEXT: s_mov_b32 s14, s98
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], a[32:33], off
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], a[32:33], off
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67]
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[58:59]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13
; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[40:41], off
; GLOBALNESS1-NEXT: s_branch .LBB1_13
; GLOBALNESS1-NEXT: .LBB1_23: ; %LeafBlock
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
@@ -359,21 +360,21 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v41, 2
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v41, 3
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 2
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 3
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off
; GLOBALNESS1-NEXT: s_branch .LBB1_1
; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off
; GLOBALNESS1-NEXT: s_branch .LBB1_2
; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
@@ -388,7 +389,7 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS1-NEXT: s_mov_b32 s12, s100
; GLOBALNESS1-NEXT: s_mov_b32 s13, s99
; GLOBALNESS1-NEXT: s_mov_b32 s14, s98
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43
; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17]
; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4
; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12
@@ -406,7 +407,7 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS1-NEXT: s_mov_b32 s12, s100
; GLOBALNESS1-NEXT: s_mov_b32 s13, s99
; GLOBALNESS1-NEXT: s_mov_b32 s14, s98
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43
; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17]
; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4
; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12
@@ -418,12 +419,12 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[6:7]
; GLOBALNESS0-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0
; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, v0
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v44, 0
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v0
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v40, 0
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT: global_store_dword v[0:1], v44, off
+; GLOBALNESS0-NEXT: global_store_dword v[0:1], v40, off
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT: global_load_dword v0, v44, s[36:37]
+; GLOBALNESS0-NEXT: global_load_dword v0, v40, s[36:37]
; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GLOBALNESS0-NEXT: s_mov_b64 s[62:63], s[4:5]
; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18
@@ -431,10 +432,10 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17
; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, 0x40994400
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, 0x40994400
; GLOBALNESS0-NEXT: s_bitcmp1_b32 s38, 0
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[44:45]
+; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41]
; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0
; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0
; GLOBALNESS0-NEXT: s_xor_b64 s[94:95], s[4:5], -1
@@ -461,20 +462,20 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS0-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
-; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 0
-; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 1
+; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 0
+; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 1
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 2
-; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 3
+; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 2
+; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 3
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
-; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 4
+; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 4
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0
-; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 5
+; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 5
; GLOBALNESS0-NEXT: s_branch .LBB1_4
; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 4
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 5
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 4
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 5
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29
; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow6
@@ -522,10 +523,10 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1
; GLOBALNESS0-NEXT: ; Child Loop BB1_15 Depth 2
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1]
-; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1]
+; GLOBALNESS0-NEXT: flat_load_dword v44, v[0:1]
; GLOBALNESS0-NEXT: s_add_u32 s8, s60, 40
-; GLOBALNESS0-NEXT: buffer_store_dword v44, off, s[0:3], 0
-; GLOBALNESS0-NEXT: flat_load_dword v43, v[0:1]
+; GLOBALNESS0-NEXT: buffer_store_dword v40, off, s[0:3], 0
+; GLOBALNESS0-NEXT: flat_load_dword v45, v[0:1]
; GLOBALNESS0-NEXT: s_addc_u32 s9, s61, 0
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55]
@@ -533,7 +534,7 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS0-NEXT: s_mov_b32 s12, s100
; GLOBALNESS0-NEXT: s_mov_b32 s13, s99
; GLOBALNESS0-NEXT: s_mov_b32 s14, s98
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67]
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43]
@@ -603,19 +604,19 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS0-NEXT: ; %bb.10: ; %bb33.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[32:33], off
-; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 0
-; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 1
+; GLOBALNESS0-NEXT: v_readlane_b32 s4, v42, 0
+; GLOBALNESS0-NEXT: v_readlane_b32 s5, v42, 1
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_12
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[44:45], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off
; GLOBALNESS0-NEXT: .LBB1_12: ; %bb44.lr.ph.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43
-; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45
+; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc
; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], s[42:43]
; GLOBALNESS0-NEXT: s_mov_b32 s75, s39
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
@@ -664,9 +665,9 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS0-NEXT: s_mov_b32 s12, s100
; GLOBALNESS0-NEXT: s_mov_b32 s13, s99
; GLOBALNESS0-NEXT: s_mov_b32 s14, s98
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67]
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0
+; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55]
; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65]
@@ -674,15 +675,15 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS0-NEXT: s_mov_b32 s12, s100
; GLOBALNESS0-NEXT: s_mov_b32 s13, s99
; GLOBALNESS0-NEXT: s_mov_b32 s14, s98
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], a[32:33], off
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], a[32:33], off
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67]
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[58:59]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13
; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[40:41], off
; GLOBALNESS0-NEXT: s_branch .LBB1_13
; GLOBALNESS0-NEXT: .LBB1_23: ; %LeafBlock
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
@@ -744,21 +745,21 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 2
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 3
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 2
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 3
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off
; GLOBALNESS0-NEXT: s_branch .LBB1_1
; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off
; GLOBALNESS0-NEXT: s_branch .LBB1_2
; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5]
@@ -773,7 +774,7 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS0-NEXT: s_mov_b32 s12, s100
; GLOBALNESS0-NEXT: s_mov_b32 s13, s99
; GLOBALNESS0-NEXT: s_mov_b32 s14, s98
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43
; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17]
; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4
; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12
@@ -791,7 +792,7 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
; GLOBALNESS0-NEXT: s_mov_b32 s12, s100
; GLOBALNESS0-NEXT: s_mov_b32 s13, s99
; GLOBALNESS0-NEXT: s_mov_b32 s14, s98
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43
; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17]
; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4
; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 48e7db518fea..f3e434fd5653 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -8,8 +8,9 @@ define hidden void @widget() {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 16
+; GCN-NEXT: v_writelane_b32 v42, s33, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -119,9 +120,10 @@ define hidden void @widget() {
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 16
+; GCN-NEXT: v_readlane_b32 s33, v42, 0
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -263,8 +265,9 @@ define hidden void @blam() {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 18
+; GCN-NEXT: v_writelane_b32 v46, s33, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x800
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index ed0de729dafd..c8322e334ca9 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -13,8 +13,9 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v45, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: v_mov_b32_e32 v36, v16
; GFX9-NEXT: v_mov_b32_e32 v35, v15
@@ -54,9 +55,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v45, 0
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -67,6 +69,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: v_mov_b32_e32 v36, v16
@@ -74,7 +77,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: v_mov_b32_e32 v34, v14
; GFX10-NEXT: v_mov_b32_e32 v33, v13
; GFX10-NEXT: v_mov_b32_e32 v32, v12
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v45, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
@@ -111,9 +114,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfc00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v45, 0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16
+; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -124,12 +129,14 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:20
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; GFX11-NEXT: v_mov_b32_e32 v32, v12
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v45, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12
@@ -164,9 +171,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_addk_i32 s32, 0xffe0
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v45, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:20
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -200,8 +209,9 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v46, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
@@ -235,9 +245,10 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v46, 0
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -248,9 +259,10 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v46, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
@@ -285,9 +297,11 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfc00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v46, 0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20
+; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -298,9 +312,11 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:20 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:20
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:24
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v46, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16
@@ -333,9 +349,11 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_addk_i32 s32, 0xffe0
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v46, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:20 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:24
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 98f13357111b..ed6daca96718 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1112,11 +1112,12 @@ declare void @external_void_func_void() #1
; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]+]], -1{{$}}
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]]
-; GCN-NEXT: v_writelane_b32 v40, s33, 2
+; GCN-NEXT: v_writelane_b32 v41, s33, 0
; GCN: s_mov_b32 s33, s32
; GFX1064: s_addk_i32 s32, 0x400
; GFX1032: s_addk_i32 s32, 0x200
@@ -1131,10 +1132,12 @@ declare void @external_void_func_void() #1
; GFX1064: s_addk_i32 s32, 0xfc00
; GFX1032: s_addk_i32 s32, 0xfe00
-; GCN: v_readlane_b32 s33, v40, 2
+; GCN: v_readlane_b32 s33, v41, 0
; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]]
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index fa87adb73512..9c9f36805008 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -337,7 +337,7 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 2
+; GFX9-O0-NEXT: s_mov_b32 s35, s33
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
@@ -371,7 +371,7 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00
-; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 2
+; GFX9-O0-NEXT: s_mov_b32 s33, s35
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -388,8 +388,8 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_writelane_b32 v3, s33, 2
; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0
+; GFX9-O3-NEXT: s_mov_b32 s38, s33
; GFX9-O3-NEXT: s_mov_b32 s33, s32
; GFX9-O3-NEXT: s_addk_i32 s32, 0x400
; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1
@@ -411,7 +411,7 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-O3-NEXT: v_readlane_b32 s33, v3, 2
+; GFX9-O3-NEXT: s_mov_b32 s33, s38
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -532,7 +532,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_writelane_b32 v10, s33, 8
+; GFX9-O0-NEXT: s_mov_b32 s42, s33
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00
; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0
@@ -599,7 +599,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1
; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400
-; GFX9-O0-NEXT: v_readlane_b32 s33, v10, 8
+; GFX9-O0-NEXT: s_mov_b32 s33, s42
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
@@ -639,8 +639,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_writelane_b32 v8, s33, 2
; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0
+; GFX9-O3-NEXT: s_mov_b32 s40, s33
; GFX9-O3-NEXT: s_mov_b32 s33, s32
; GFX9-O3-NEXT: s_addk_i32 s32, 0x800
; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1
@@ -672,7 +672,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1
; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-O3-NEXT: v_readlane_b32 s33, v8, 2
+; GFX9-O3-NEXT: s_mov_b32 s33, s40
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_nop 0
More information about the llvm-commits
mailing list