[llvm] [AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init (PR #79586)
via llvm-commits
llvm-commits@lists.llvm.org
Thu Feb 8 07:30:38 PST 2024
https://github.com/alex-t updated https://github.com/llvm/llvm-project/pull/79586
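A note for readers of the series: on targets that receive flat_scratch_init, the kernel prologue no longer forms the private buffer resource descriptor by adding the scratch wave offset into a preloaded PRIVATE_SEGMENT_BUFFER; it synthesizes the descriptor from flat_scratch instead. A simplified sketch of the final prologue flow (it mirrors PATCH 4/9 below; see SIFrameLowering::emitEntryFunctionPrologue for the real code):

    Register FlatScratchInit = AMDGPU::NoRegister;
    if (NeedsFlatScratchInit)
      FlatScratchInit =
          emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
    if (ScratchRsrcReg)
      emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, FlatScratchInit,
                                           ScratchRsrcReg, PreloadedScratchRsrcReg,
                                           ScratchWaveOffsetReg);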
From 099d17c53c006ed3931f56f94b74c70936e0c366 Mon Sep 17 00:00:00 2001
From: Alexander <alexander.timofeev@amd.com>
Date: Tue, 16 Jan 2024 22:08:07 +0100
Subject: [PATCH 1/9] SWDEV-409366 WIP
---
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 47 +++++++++++++++-------
llvm/lib/Target/AMDGPU/SIFrameLowering.h | 9 ++++-
2 files changed, 39 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0f89df1444866..e878c4dc06ac9 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -610,11 +610,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const Function &F = MF.getFunction();
+ // const Function &F = MF.getFunction();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
assert(MFI->isEntryFunction());
+ bool NeedsFlatScratchInit =
+ MFI->getUserSGPRInfo().hasFlatScratchInit() &&
+ (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
+ (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+
Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
@@ -635,12 +640,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
OtherBB.addLiveIn(ScratchRsrcReg);
}
}
+ } else {
+ ScratchRsrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
}
// Now that we have fixed the reserved SRSRC we need to locate the
// (potentially) preloaded SRSRC.
Register PreloadedScratchRsrcReg;
- if (ST.isAmdHsaOrMesa(F)) {
+ if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) {
PreloadedScratchRsrcReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
@@ -696,10 +703,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
- bool NeedsFlatScratchInit =
- MFI->getUserSGPRInfo().hasFlatScratchInit() &&
- (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
- (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
@@ -712,17 +715,17 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
if (ScratchRsrcReg) {
- emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
- PreloadedScratchRsrcReg,
- ScratchRsrcReg, ScratchWaveOffsetReg);
+ emitEntryFunctionScratchRsrcRegSetup(
+ MF, MBB, I, DL, NeedsFlatScratchInit, ScratchRsrcReg,
+ PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
}
}
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- const DebugLoc &DL, Register PreloadedScratchRsrcReg,
- Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
+ const DebugLoc &DL, bool HasFlatScratchInit, Register ScratchRsrcReg,
+ Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -829,11 +832,25 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else if (ST.isAmdHsaOrMesa(Fn)) {
- assert(PreloadedScratchRsrcReg);
- if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
- .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+ if (HasFlatScratchInit) {
+ if (Register FlatScratchReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT)) {
+ I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
+ .addReg(FlatScratchReg)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
+ .addImm(0xf0000000)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ }
+ return;
+ } else {
+ assert(PreloadedScratchRsrcReg);
+
+ if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index b3feb759ed811..cab4c81ac6e2f 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -78,8 +78,13 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
void emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
- Register ScratchWaveOffsetReg) const;
+ bool HasFlatScratchInit, Register ScratchRsrcReg,
+ Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;
+ // void emitEntryFunctionScratchRsrcRegSetup(
+ // MachineFunction &MF, MachineBasicBlock &MBB,
+ // MachineBasicBlock::iterator I, const DebugLoc &DL,
+ // Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
+ // Register ScratchWaveOffsetReg) const;
public:
bool hasFP(const MachineFunction &MF) const override;
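For reference, the predicate that PATCH 1/9 hoists ahead of the scratch RSRC selection (copied verbatim from the diff above) is what now gates both the flat scratch setup and the descriptor synthesis:

    bool NeedsFlatScratchInit =
        MFI->getUserSGPRInfo().hasFlatScratchInit() &&
        (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
         (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

Computing it early lets the prologue skip looking up the preloaded PRIVATE_SEGMENT_BUFFER whenever the descriptor can be synthesized instead.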
From cc479a91b50465d37967e521706473cd2a0b1759 Mon Sep 17 00:00:00 2001
From: Alexander <alexander.timofeev@amd.com>
Date: Tue, 16 Jan 2024 22:08:07 +0100
Subject: [PATCH 2/9] SWDEV-409366 WIP
---
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index e878c4dc06ac9..dbc4959f1e657 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -773,7 +773,8 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(21)
.addReg(Rsrc03);
}
- } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
+ } else if (ST.isMesaGfxShader(Fn) ||
+ (!HasFlatScratchInit && !PreloadedScratchRsrcReg)) {
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
From cefab2497206f2e8d75f70067c325c7b8d6b57d6 Mon Sep 17 00:00:00 2001
From: Alexander <alexander.timofeev@amd.com>
Date: Tue, 16 Jan 2024 22:08:07 +0100
Subject: [PATCH 3/9] SWDEV-409366 WIP
---
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 20 ++++++++------------
llvm/lib/Target/AMDGPU/SIFrameLowering.h | 5 -----
2 files changed, 8 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index dbc4959f1e657..6ad9c604095f9 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -610,7 +610,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- // const Function &F = MF.getFunction();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
assert(MFI->isEntryFunction());
@@ -640,8 +639,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
OtherBB.addLiveIn(ScratchRsrcReg);
}
}
- } else {
- ScratchRsrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
}
// Now that we have fixed the reserved SRSRC we need to locate the
@@ -835,15 +832,14 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
} else if (ST.isAmdHsaOrMesa(Fn)) {
if (HasFlatScratchInit) {
- if (Register FlatScratchReg =
- MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT)) {
- I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
- .addReg(FlatScratchReg)
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
- I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
- .addImm(0xf0000000)
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
- }
+ I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
+ TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
+ .addReg(AMDGPU::FLAT_SCR)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64),
+ TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
+ .addImm(0xf0000000)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
return;
} else {
assert(PreloadedScratchRsrcReg);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index cab4c81ac6e2f..dc0b4acf6ddae 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -80,11 +80,6 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
MachineBasicBlock::iterator I, const DebugLoc &DL,
bool HasFlatScratchInit, Register ScratchRsrcReg,
Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;
- // void emitEntryFunctionScratchRsrcRegSetup(
- // MachineFunction &MF, MachineBasicBlock &MBB,
- // MachineBasicBlock::iterator I, const DebugLoc &DL,
- // Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
- // Register ScratchWaveOffsetReg) const;
public:
bool hasFP(const MachineFunction &MF) const override;
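The core of the synthesis as of PATCH 3/9, as a short sketch (PATCH 4/9 below generalizes the hard-coded AMDGPU::FLAT_SCR source to the register returned by emitEntryFunctionFlatScratchInit): the low two dwords of the descriptor are copied from flat scratch and the high two are set to the constant 0xf0000000:

    I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
                TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
            .addReg(AMDGPU::FLAT_SCR)
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64),
                TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
            .addImm(0xf0000000)
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);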
From cbecf53314c6d969d6590a62edcf651718a73105 Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev@amd.com>
Date: Wed, 24 Jan 2024 21:39:45 +0100
Subject: [PATCH 4/9] SWDEV-409366 WIP 24 Jan
---
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 40 +--
llvm/lib/Target/AMDGPU/SIFrameLowering.h | 12 +-
.../GlobalISel/call-outgoing-stack-args.ll | 8 +-
.../abi-attribute-hints-undefined-behavior.ll | 16 +-
...der-no-live-segment-at-def-implicit-def.ll | 4 +-
.../branch-folding-implicit-def-subreg.ll | 6 +-
.../CodeGen/AMDGPU/call-argument-types.ll | 280 +++++++++---------
.../CodeGen/AMDGPU/call-reqd-group-size.ll | 24 +-
llvm/test/CodeGen/AMDGPU/call-waitcnt.ll | 24 +-
.../AMDGPU/callee-special-input-vgprs.ll | 6 +-
llvm/test/CodeGen/AMDGPU/cc-update.ll | 64 ++--
.../AMDGPU/cross-block-use-is-not-abi-copy.ll | 8 +-
.../AMDGPU/indirect-call-known-callees.ll | 8 +-
llvm/test/CodeGen/AMDGPU/indirect-call.ll | 16 +-
.../kernel-vgpr-spill-mubuf-with-voffset.ll | 4 +-
llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 48 +--
.../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 4 +-
.../AMDGPU/lower-module-lds-via-hybrid.ll | 12 +-
.../AMDGPU/lower-module-lds-via-table.ll | 12 +-
...ne-sink-temporal-divergence-swdev407790.ll | 12 +-
.../AMDGPU/need-fp-from-vgpr-spills.ll | 12 +-
.../CodeGen/AMDGPU/simple-indirect-call.ll | 4 +-
.../AMDGPU/tuple-allocation-failure.ll | 12 +-
.../CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 4 +-
24 files changed, 322 insertions(+), 318 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6ad9c604095f9..4842cf83af0e1 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -378,7 +378,7 @@ class PrologEpilogSGPRSpillBuilder {
} // namespace llvm
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
-void SIFrameLowering::emitEntryFunctionFlatScratchInit(
+Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -398,6 +398,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
Register FlatScrInitLo;
Register FlatScrInitHi;
+ Register FlatScratchInitReg = AMDGPU::NoRegister;
if (ST.isAmdPalOS()) {
// Extract the scratch offset from the descriptor in the GIT
@@ -407,7 +408,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// Find unused reg to load flat scratch init into
MachineRegisterInfo &MRI = MF.getRegInfo();
- Register FlatScrInit = AMDGPU::NoRegister;
+ Register FlatScratchInitReg = AMDGPU::NoRegister;
ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
AllSGPR64s = AllSGPR64s.slice(
@@ -416,16 +417,16 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
for (MCPhysReg Reg : AllSGPR64s) {
if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
- FlatScrInit = Reg;
+ FlatScratchInitReg = Reg;
break;
}
}
- assert(FlatScrInit && "Failed to find free register for scratch init");
+ assert(FlatScratchInitReg && "Failed to find free register for scratch init");
- FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
- FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
+ FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
- buildGitPtr(MBB, I, DL, TII, FlatScrInit);
+ buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg);
// We now have the GIT ptr - now get the scratch descriptor from the entry
// at offset 0 (or offset 16 for a compute shader).
@@ -440,8 +441,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
- BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
- .addReg(FlatScrInit)
+ BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg)
+ .addReg(FlatScratchInitReg)
.addImm(EncodedOffset) // offset
.addImm(0) // cpol
.addMemOperand(MMO);
@@ -453,7 +454,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addImm(0xffff);
And->getOperand(3).setIsDead(); // Mark SCC as dead.
} else {
- Register FlatScratchInitReg =
+ FlatScratchInitReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
assert(FlatScratchInitReg);
@@ -485,7 +486,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
addReg(FlatScrInitHi).
addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
(31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
- return;
+ return FlatScratchInitReg;
}
// For GFX9.
@@ -498,7 +499,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addImm(0);
Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
- return;
+ return AMDGPU::FLAT_SCR;
}
assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
@@ -519,6 +520,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addReg(FlatScrInitLo, RegState::Kill)
.addImm(8);
LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
+ return AMDGPU::FLAT_SCR;
}
// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
@@ -707,13 +709,15 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
+ Register FlatScratchInit = AMDGPU::NoRegister;
if (NeedsFlatScratchInit) {
- emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
+ FlatScratchInit =
+ emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
}
if (ScratchRsrcReg) {
emitEntryFunctionScratchRsrcRegSetup(
- MF, MBB, I, DL, NeedsFlatScratchInit, ScratchRsrcReg,
+ MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg,
PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
}
}
@@ -721,7 +725,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- const DebugLoc &DL, bool HasFlatScratchInit, Register ScratchRsrcReg,
+ const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg,
Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -771,7 +775,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addReg(Rsrc03);
}
} else if (ST.isMesaGfxShader(Fn) ||
- (!HasFlatScratchInit && !PreloadedScratchRsrcReg)) {
+ (!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) {
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -831,10 +835,10 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else if (ST.isAmdHsaOrMesa(Fn)) {
- if (HasFlatScratchInit) {
+ if (FlatScratchInit) {
I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
- .addReg(AMDGPU::FLAT_SCR)
+ .addReg(FlatScratchInit)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64),
TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index dc0b4acf6ddae..f706d48b2dc10 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -67,18 +67,18 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
MachineBasicBlock::iterator MI) const override;
private:
- void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const DebugLoc &DL,
- Register ScratchWaveOffsetReg) const;
+ Register
+ emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register ScratchWaveOffsetReg) const;
Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
void emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- bool HasFlatScratchInit, Register ScratchRsrcReg,
+ Register FlatScratchInit, Register ScratchRsrcReg,
Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;
public:
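The test churn below follows one pattern; for example, in kernel_caller_stack the prologue changes from the old carry-chain

    s_add_u32 s0, s0, s7
    s_addc_u32 s1, s1, 0

to the synthesized descriptor

    s_mov_b64 s[2:3], 0xf0000000
    s_mov_b64 s[0:1], flat_scratch

and the remaining diffs are the same substitution plus instruction scheduling noise.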
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index e597ce6f114a6..a7277414391cb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -14,9 +14,9 @@ define amdgpu_kernel void @kernel_caller_stack() {
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_add_u32 s0, s0, s7
+; MUBUF-NEXT: s_mov_b64 s[2:3], 0xf0000000
; MUBUF-NEXT: s_mov_b32 s32, 0
-; MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
@@ -62,8 +62,8 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_add_u32 s0, s0, s7
-; MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; MUBUF-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index a439c0f51ffe9..bda25cda4c5f9 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -48,19 +48,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-SDAG: ; %bb.0:
; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9
-; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-SDAG-NEXT: s_mov_b64 s[2:3], 0xf0000000
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-SDAG-NEXT: s_mov_b64 s[0:1], flat_scratch
; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2
; FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6
; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7
; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5]
; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
@@ -70,19 +70,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-GISEL: ; %bb.0:
; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9
-; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-GISEL-NEXT: s_mov_b64 s[2:3], 0xf0000000
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1
; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6
; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7
; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5]
; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index 7c8d40c49bb80..20f60d1db7fb5 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -10,8 +10,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 5a128c7541d1e..2baaefb76acb3 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -5,13 +5,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-LABEL: name: f1
; GFX90A: bb.0.bb:
; GFX90A-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11
+ ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0
; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc
; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc
- ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr2_sgpr3 = S_MOV_B64 4026531840, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index a192a1b8dff93..0fe54349215ba 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -129,12 +129,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; HSA-LABEL: test_call_external_void_func_i1_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
@@ -234,8 +234,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
@@ -339,8 +339,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
@@ -422,12 +422,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; HSA-LABEL: test_call_external_void_func_i8_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
@@ -525,8 +525,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
@@ -625,8 +625,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
@@ -707,12 +707,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_i16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
@@ -809,8 +809,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_sshort v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
@@ -909,8 +909,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
@@ -991,12 +991,12 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; HSA-LABEL: test_call_external_void_func_i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 42
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
@@ -1078,13 +1078,13 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_i64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: v_mov_b32_e32 v1, 0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
@@ -1182,12 +1182,12 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_mov_b32 s4, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_mov_b32 s5, s4
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
@@ -1278,15 +1278,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2i64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: v_mov_b32_e32 v2, 3
; HSA-NEXT: v_mov_b32_e32 v3, 4
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
@@ -1391,12 +1391,12 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_mov_b32 s4, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_mov_b32 s5, s4
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v4, 1
; HSA-NEXT: v_mov_b32_e32 v5, 2
; HSA-NEXT: s_mov_b32 s32, 0
@@ -1514,12 +1514,12 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_mov_b32 s4, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_mov_b32 s5, s4
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v4, 1
; HSA-NEXT: v_mov_b32_e32 v5, 2
; HSA-NEXT: v_mov_b32_e32 v6, 3
@@ -1605,12 +1605,12 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_f16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x4400
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
@@ -1689,12 +1689,12 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 4.0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f32 at rel32@hi+12
@@ -1776,13 +1776,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
@@ -1868,14 +1868,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: v_mov_b32_e32 v2, 4.0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
@@ -1968,16 +1968,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v5f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: v_mov_b32_e32 v2, 4.0
; HSA-NEXT: v_mov_b32_e32 v3, -1.0
; HSA-NEXT: v_mov_b32_e32 v4, 0.5
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
@@ -2059,13 +2059,13 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_f64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
@@ -2154,15 +2154,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2f64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: v_mov_b32_e32 v2, 0
; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
@@ -2258,9 +2258,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3f64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: v_mov_b32_e32 v2, 0
@@ -2268,7 +2269,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; HSA-NEXT: v_mov_b32_e32 v4, 0
; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
@@ -2357,14 +2357,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; HSA-LABEL: test_call_external_void_func_v2i16:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
@@ -2456,14 +2456,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; HSA-LABEL: test_call_external_void_func_v3i16:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
@@ -2556,14 +2556,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; HSA-LABEL: test_call_external_void_func_v3f16:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
@@ -2647,13 +2647,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3i16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x20001
; HSA-NEXT: v_mov_b32_e32 v1, 3
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
@@ -2737,13 +2737,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3f16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00
; HSA-NEXT: v_mov_b32_e32 v1, 0x4400
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
@@ -2835,14 +2835,14 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; HSA-LABEL: test_call_external_void_func_v4i16:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
@@ -2928,13 +2928,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v4i16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x20001
; HSA-NEXT: v_mov_b32_e32 v1, 0x40003
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
@@ -3025,14 +3025,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; HSA-LABEL: test_call_external_void_func_v2f16:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
@@ -3120,14 +3120,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; HSA-LABEL: test_call_external_void_func_v2i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
@@ -3210,13 +3210,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
@@ -3302,14 +3302,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; HSA-LABEL: test_call_external_void_func_v3i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: v_mov_b32_e32 v1, 4
; HSA-NEXT: v_mov_b32_e32 v2, 5
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
@@ -3398,15 +3398,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; HSA-LABEL: test_call_external_void_func_v3i32_i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: v_mov_b32_e32 v1, 4
; HSA-NEXT: v_mov_b32_e32 v2, 5
; HSA-NEXT: v_mov_b32_e32 v3, 6
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
@@ -3493,14 +3493,14 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; HSA-LABEL: test_call_external_void_func_v4i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
@@ -3590,15 +3590,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v4i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: v_mov_b32_e32 v2, 3
; HSA-NEXT: v_mov_b32_e32 v3, 4
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
@@ -3691,16 +3691,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v5i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: v_mov_b32_e32 v2, 3
; HSA-NEXT: v_mov_b32_e32 v3, 4
; HSA-NEXT: v_mov_b32_e32 v4, 5
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
@@ -3803,13 +3803,13 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
@@ -3915,9 +3915,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v8i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: v_mov_b32_e32 v2, 3
@@ -3927,7 +3928,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; HSA-NEXT: v_mov_b32_e32 v6, 7
; HSA-NEXT: v_mov_b32_e32 v7, 8
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
@@ -4038,7 +4038,6 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4046,7 +4045,8 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
@@ -4183,7 +4183,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4195,8 +4194,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
@@ -4359,9 +4359,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; HSA-NEXT: s_add_u32 s0, s0, s9
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
@@ -4468,14 +4468,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
;
; HSA-LABEL: test_call_external_i32_func_i32_imm:
; HSA: ; %bb.0:
-; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0
+; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 42
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_mov_b32 s39, 0x1100f000
; HSA-NEXT: s_mov_b32 s38, -1
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -4583,13 +4583,13 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; HSA-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
@@ -4704,9 +4704,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8
; HSA-NEXT: v_mov_b32_e32 v0, 8
@@ -4714,7 +4715,6 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
; HSA-NEXT: s_movk_i32 s32, 0x400
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
@@ -4879,9 +4879,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8
; HSA-NEXT: v_mov_b32_e32 v0, 8
@@ -4889,7 +4890,6 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
; HSA-NEXT: s_movk_i32 s32, 0x800
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
@@ -5087,10 +5087,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; HSA-NEXT: s_mov_b32 s32, 0
@@ -5341,14 +5341,14 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64:
; HSA: ; %bb.0: ; %entry
; HSA-NEXT: s_add_i32 s6, s6, s9
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT: s_add_u32 s0, s0, s9
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; HSA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x80
; HSA-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: v_mov_b32_e32 v0, s23
; HSA-NEXT: v_mov_b32_e32 v1, s6
diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
index c62a082459105..efa412d6b0a32 100644
--- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
@@ -12,9 +12,9 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -32,8 +32,8 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -51,8 +51,8 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -70,8 +70,8 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -89,8 +89,8 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -109,8 +109,8 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_mov_b32_e32 v31, 0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 616e5f00fc1e5..8e4baf4a8e5a4 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -8,8 +8,8 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: ds_read_b32 v0, v0
@@ -31,9 +31,9 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v0, v0, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
@@ -52,11 +52,11 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -74,11 +74,11 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #
define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call_return_val:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -99,12 +99,12 @@ define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 6d603ef039769..49bf48a3687c9 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -165,7 +165,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
; FIXEDABI-NOT: v1
; FIXEDABI-NOT: v2
; FIXEDABI: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1
+; FIXEDABI: v_or_b32_e32 v31, v0, v1
; FIXEDABI-NOT: v0
; FIXEDABI-NOT: v1
; FIXEDABI-NOT: v2
@@ -181,7 +181,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
; FIXEDABI-NOT: v1
; FIXEDABI-NOT: v2
; FIXEDABI: v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1
+; FIXEDABI: v_or_b32_e32 v31, v0, v1
; FIXEDABI-NOT: v0
; FIXEDABI-NOT: v1
; FIXEDABI-NOT: v2
@@ -198,7 +198,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
; FIXEDABI-NOT: v2
; FIXEDABI:v_lshlrev_b32_e32 v0, 20, v2
; FIXEDABI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-NEXT: v_or_b32_e32 v31, v1, v0
+; FIXEDABI: v_or_b32_e32 v31, v1, v0
; FIXEDABI-NOT: v0
; FIXEDABI-NOT: v1
; FIXEDABI-NOT: v2
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index ca09163b20afc..98f60a76a9e82 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -68,13 +68,13 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s15
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b64 s[2:3], 0xf0000000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
+; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -89,10 +89,10 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s15
+; GFX900-NEXT: s_mov_b64 s[2:3], 0xf0000000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
+; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -112,8 +112,8 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT: s_add_u32 s0, s0, s15
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
+; GFX1010-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
@@ -148,13 +148,13 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s15
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b64 s[2:3], 0xf0000000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
+; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
@@ -172,10 +172,10 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s15
+; GFX900-NEXT: s_mov_b64 s[2:3], 0xf0000000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
+; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
@@ -199,8 +199,8 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
-; GFX1010-NEXT: s_add_u32 s0, s0, s15
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
+; GFX1010-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -311,13 +311,13 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s15
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b64 s[2:3], 0xf0000000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
+; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -333,10 +333,10 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s15
+; GFX900-NEXT: s_mov_b64 s[2:3], 0xf0000000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
+; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -358,8 +358,8 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT: s_add_u32 s0, s0, s15
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
+; GFX1010-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
@@ -413,14 +413,14 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s15
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b64 s[2:3], 0xf0000000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 s33, 0
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
+; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
@@ -438,11 +438,11 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s15
+; GFX900-NEXT: s_mov_b64 s[2:3], 0xf0000000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_mov_b32 s33, 0
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
+; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
@@ -467,8 +467,8 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
-; GFX1010-NEXT: s_add_u32 s0, s0, s15
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
+; GFX1010-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 11871db1ef656..408d3b2c4c229 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -180,8 +180,8 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT: s_add_u32 s0, s0, s15
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s8, 0
@@ -229,8 +229,8 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT: s_add_u32 s0, s0, s15
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s8, 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index fe7323eeadf8a..955c0605f0536 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -12,8 +12,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s7
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_load_dword s7, s[4:5], 0x0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -24,14 +22,16 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; CHECK-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_and_b32 s4, 1, s7
; CHECK-NEXT: s_cmp_eq_u32 s4, 1
-; CHECK-NEXT: v_mov_b32_e32 v31, v0
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: s_cselect_b32 s5, s13, s11
; CHECK-NEXT: s_cselect_b32 s4, s12, s10
+; CHECK-NEXT: s_mov_b64 s[8:9], 0
; CHECK-NEXT: s_mov_b32 s12, s6
+; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 3aaf04c94cda5..594f3b33bfb36 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -12,8 +12,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_add_u32 s0, s0, s17
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_getpc_b64 s[14:15]
@@ -37,8 +37,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GISEL-NEXT: s_add_i32 s12, s12, s17
; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT: s_add_u32 s0, s0, s17
-; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
; GISEL-NEXT: s_mov_b32 s13, s15
; GISEL-NEXT: s_mov_b32 s12, s14
; GISEL-NEXT: s_getpc_b64 s[14:15]
@@ -67,8 +67,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_add_u32 s0, s0, s17
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_getpc_b64 s[14:15]
@@ -93,8 +93,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GISEL-NEXT: s_add_i32 s12, s12, s17
; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT: s_add_u32 s0, s0, s17
-; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
; GISEL-NEXT: s_mov_b32 s13, s15
; GISEL-NEXT: s_mov_b32 s12, s14
; GISEL-NEXT: s_getpc_b64 s[14:15]
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index 6e905542ce53c..4944a6db486ba 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -11,8 +11,8 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: s_mov_b32 s33, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: v_mov_b32_e32 v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 66f31bbf7afe0..96dfb1fbf8017 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -118,10 +118,10 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -177,10 +177,10 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -236,10 +236,10 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -295,10 +295,10 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -341,8 +341,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -351,6 +349,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 0
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -370,14 +370,14 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -410,8 +410,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -420,6 +418,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 2
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -439,14 +439,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -479,8 +479,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -489,6 +487,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 1
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -508,14 +508,14 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -548,8 +548,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -558,6 +556,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 3
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -577,14 +577,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 61818dafd2b84..5c33d50382174 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -45,8 +45,8 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index 41551d5fb9060..34ca2793997a4 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -164,8 +164,8 @@ define amdgpu_kernel void @k01() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4
@@ -198,8 +198,8 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4
@@ -240,8 +240,8 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 38d6039670ab4..9029e433700f5 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -229,8 +229,8 @@ define amdgpu_kernel void @k01() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4
@@ -268,8 +268,8 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4
@@ -310,8 +310,8 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 1c75a2fc3dce6..e88b76b30197c 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -44,17 +44,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7]
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v40, v0
; CHECK-NEXT: s_add_u32 s42, s34, 40
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_addc_u32 s43, s35, 0
-; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s33, s14
; CHECK-NEXT: s_mov_b32 s40, s13
; CHECK-NEXT: s_mov_b32 s41, s12
@@ -766,17 +766,17 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx2 s[46:47], s[6:7], 0x10
-; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7]
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v40, v0
; CHECK-NEXT: s_add_u32 s42, s36, 40
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_addc_u32 s43, s37, 0
-; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s33, s14
; CHECK-NEXT: s_mov_b32 s40, s13
; CHECK-NEXT: s_mov_b32 s41, s12
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index f70441e87a74b..46b69537798e5 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -69,8 +69,8 @@ define amdgpu_kernel void @kernel_call() {
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[16:17]
@@ -128,8 +128,8 @@ define amdgpu_kernel void @kernel_tailcall() {
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[16:17]
@@ -240,8 +240,8 @@ define protected amdgpu_kernel void @kernel() {
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[16:17]
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index dcc90c0dcd407..92fbbdb9c4e87 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -45,8 +45,8 @@ define amdgpu_kernel void @test_simple_indirect_call() {
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: s_add_u32 s0, s0, s17
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NEXT: s_mul_i32 s4, s4, s5
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index dd8ff64a4eec2..9510b400d1d6c 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -45,10 +45,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18
; GLOBALNESS1-NEXT: s_load_dword s7, s[38:39], 0x20
; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s10, s15
-; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s15
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
-; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400
; GLOBALNESS1-NEXT: s_bitcmp1_b32 s74, 0
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
@@ -78,6 +76,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3
+; GLOBALNESS1-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GLOBALNESS1-NEXT: s_mov_b64 s[0:1], flat_scratch
; GLOBALNESS1-NEXT: s_mov_b32 s68, s14
; GLOBALNESS1-NEXT: s_mov_b32 s69, s13
; GLOBALNESS1-NEXT: s_mov_b32 s70, s12
@@ -327,10 +327,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18
; GLOBALNESS0-NEXT: s_load_dword s7, s[38:39], 0x20
; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s10, s15
-; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s15
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
-; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400
; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
@@ -360,6 +358,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3
+; GLOBALNESS0-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GLOBALNESS0-NEXT: s_mov_b64 s[0:1], flat_scratch
; GLOBALNESS0-NEXT: s_mov_b32 s66, s14
; GLOBALNESS0-NEXT: s_mov_b32 s67, s13
; GLOBALNESS0-NEXT: s_mov_b32 s68, s12
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
index 7840559c78eb6..c8ca99926eab4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
@@ -14,8 +14,8 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id
; CHECK-NEXT: s_addc_u32 s11, s11, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
; CHECK-NEXT: v_mov_b32_e32 v5, 42
From 10de7a0ce6eb332048477f50dd54c9ba49a1a664 Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev@amd.com>
Date: Fri, 2 Feb 2024 19:49:58 +0100
Subject: [PATCH 5/9] [AMDGPU] Compiler should synthesize private buffer
resource descriptor from flat_scratch_init
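
When the prologue initializes flat scratch, it now synthesizes the private
buffer resource descriptor from flat_scratch_init instead of relying on a
preloaded SRSRC: the 64-bit flat scratch base is copied into words 0-1, and
words 2-3 are materialized from getScratchRsrcWords23() with two s_mov_b32
instructions. Below is a minimal standalone sketch of that word split; the
0x11e80000ffffffff constant is only an illustrative example (the value seen
in the updated gfx8 HSA CHECK lines), not something the compiler hard-codes:

  // Sketch under the assumption above: split a 64-bit words 2..3 constant
  // into the two 32-bit immediates emitted as separate s_mov_b32.
  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t Rsrc23 = 0x11e80000ffffffffULL;         // assumed example value
    uint32_t Word2 = uint32_t(Rsrc23 & 0xffffffff);  // NUM_RECORDS field
    uint32_t Word3 = uint32_t(Rsrc23 >> 32);         // descriptor config bits
    // Prints 0xffffffff (i.e. -1, matching "s_mov_b32 s2, -1") and 0x11e80000.
    std::printf("s2 = 0x%08x, s3 = 0x%08x\n", Word2, Word3);
    return 0;
  }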
---
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 32 ++--
.../GlobalISel/call-outgoing-stack-args.ll | 6 +-
.../abi-attribute-hints-undefined-behavior.ll | 6 +-
...der-no-live-segment-at-def-implicit-def.ll | 3 +-
.../branch-folding-implicit-def-subreg.ll | 3 +-
.../CodeGen/AMDGPU/call-argument-types.ll | 149 ++++++++++++------
.../CodeGen/AMDGPU/call-reqd-group-size.ll | 18 ++-
llvm/test/CodeGen/AMDGPU/call-waitcnt.ll | 17 +-
llvm/test/CodeGen/AMDGPU/cc-update.ll | 44 ++++--
.../AMDGPU/cross-block-use-is-not-abi-copy.ll | 6 +-
.../AMDGPU/indirect-call-known-callees.ll | 3 +-
llvm/test/CodeGen/AMDGPU/indirect-call.ll | 12 +-
.../kernel-vgpr-spill-mubuf-with-voffset.ll | 3 +-
llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 36 +++--
.../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 3 +-
.../AMDGPU/lower-module-lds-via-hybrid.ll | 11 +-
.../AMDGPU/lower-module-lds-via-table.ll | 9 +-
...ne-sink-temporal-divergence-swdev407790.ll | 6 +-
.../AMDGPU/need-fp-from-vgpr-spills.ll | 9 +-
.../CodeGen/AMDGPU/simple-indirect-call.ll | 7 +-
.../AMDGPU/tuple-allocation-failure.ll | 6 +-
.../CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 3 +-
22 files changed, 256 insertions(+), 136 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 4842cf83af0e1..f5c3efecd7316 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -398,7 +398,7 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
Register FlatScrInitLo;
Register FlatScrInitHi;
- Register FlatScratchInitReg = AMDGPU::NoRegister;
+ Register FlatScratchInitReg;
if (ST.isAmdPalOS()) {
// Extract the scratch offset from the descriptor in the GIT
@@ -408,7 +408,6 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
// Find unused reg to load flat scratch init into
MachineRegisterInfo &MRI = MF.getRegInfo();
- Register FlatScratchInitReg = AMDGPU::NoRegister;
ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
AllSGPR64s = AllSGPR64s.slice(
@@ -709,7 +708,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
- Register FlatScratchInit = AMDGPU::NoRegister;
+ Register FlatScratchInit;
if (NeedsFlatScratchInit) {
FlatScratchInit =
emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
@@ -836,22 +835,29 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
} else if (ST.isAmdHsaOrMesa(Fn)) {
if (FlatScratchInit) {
+ const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+ Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23();
I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
.addReg(FlatScratchInit)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
- I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64),
- TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
- .addImm(0xf0000000)
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ BuildMI(MBB, I, DL, SMovB32, Rsrc2)
+ .addImm(Rsrc23 & 0xffffffff)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc3)
+ .addImm(Rsrc23 >> 32)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
return;
- } else {
- assert(PreloadedScratchRsrcReg);
+ }
- if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
- .addReg(PreloadedScratchRsrcReg, RegState::Kill);
- }
+ assert(PreloadedScratchRsrcReg);
+
+ if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedScratchRsrcReg, RegState::Kill);
}
}
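
A note for the test updates that follow: where the earlier revision in this
series emitted a single "s_mov_b64 s[2:3], 0xf0000000", the synthesized
descriptor now sets "s_mov_b32 s2, -1" (word 2, the NUM_RECORDS field) plus a
subtarget-dependent word 3, e.g. 0x11e80000 in the HSA runs, 0xe00000 in the
MUBUF runs, and 0x31c16000 in the blender test; the GFX90A MIR checks carry
the same pair in decimal (4294967295 = 0xffffffff, 14680064 = 0xe00000).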
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index a7277414391cb..6e49a5a4ec0e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -13,8 +13,9 @@ define amdgpu_kernel void @kernel_caller_stack() {
; MUBUF-LABEL: kernel_caller_stack:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; MUBUF-NEXT: s_mov_b32 s2, -1
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; MUBUF-NEXT: s_mov_b32 s32, 0
; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
@@ -61,8 +62,9 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-LABEL: kernel_caller_byval:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; MUBUF-NEXT: s_mov_b32 s2, -1
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index bda25cda4c5f9..609b5e6f49ef1 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -48,10 +48,11 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-SDAG: ; %bb.0:
; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9
+; FIXEDABI-SDAG-NEXT: s_mov_b32 s2, -1
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; FIXEDABI-SDAG-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; FIXEDABI-SDAG-NEXT: s_mov_b32 s3, 0x11e80000
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; FIXEDABI-SDAG-NEXT: s_mov_b64 s[0:1], flat_scratch
@@ -70,10 +71,11 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-GISEL: ; %bb.0:
; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9
+; FIXEDABI-GISEL-NEXT: s_mov_b32 s2, -1
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; FIXEDABI-GISEL-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; FIXEDABI-GISEL-NEXT: s_mov_b32 s3, 0x11e80000
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2
; FIXEDABI-GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index 20f60d1db7fb5..74c6bb599cb9b 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -10,7 +10,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b32 s8, 0
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 2baaefb76acb3..c06f213b9eb66 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -10,7 +10,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0
; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc
; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc
- ; GFX90A-NEXT: $sgpr2_sgpr3 = S_MOV_B64 4026531840, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr2 = S_MOV_B32 4294967295, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr3 = S_MOV_B32 14680064, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
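For readers decoding the MIR above: 4294967295 is 0xffffffff, the NUM_RECORDS field in descriptor word 2, and 14680064 is 0xe00000, the word-3 format bits for this subtarget. The following is a minimal sketch of how such a pair of moves can be built in SIFrameLowering; it is illustrative only, not the verbatim patch code, and it assumes TII, TRI, MBB, I, DL and ScratchRsrcReg are in scope as in emitEntryFunctionScratchRsrcRegSetup.
  // Materialize rsrc words 2..3 with two 32-bit moves instead of a single
  // 64-bit literal. getScratchRsrcWords23() packs NUM_RECORDS = -1 into the
  // low half and the subtarget-specific format bits into the high half.
  uint64_t Rsrc23 = TII->getScratchRsrcWords23();
  Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
  Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
      .addImm(Lo_32(Rsrc23)) // s2 = -1
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
      .addImm(Hi_32(Rsrc23)) // s3 = 0xe00000 on gfx90a
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);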
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 0fe54349215ba..c8eb797a0081a 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -129,9 +129,10 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; HSA-LABEL: test_call_external_void_func_i1_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: s_mov_b32 s32, 0
@@ -234,7 +235,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -339,7 +341,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -422,9 +425,10 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; HSA-LABEL: test_call_external_void_func_i8_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: s_mov_b32 s32, 0
@@ -525,7 +529,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -625,7 +630,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -707,9 +713,10 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_i16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: s_mov_b32 s32, 0
@@ -809,7 +816,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_sshort v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -909,7 +917,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -991,9 +1000,10 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; HSA-LABEL: test_call_external_void_func_i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 42
; HSA-NEXT: s_mov_b32 s32, 0
@@ -1078,9 +1088,10 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_i64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: v_mov_b32_e32 v1, 0
@@ -1186,7 +1197,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_mov_b32 s5, s4
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -1278,9 +1290,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2i64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
@@ -1395,7 +1408,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_mov_b32 s5, s4
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v4, 1
; HSA-NEXT: v_mov_b32_e32 v5, 2
@@ -1518,7 +1532,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_mov_b32 s5, s4
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v4, 1
; HSA-NEXT: v_mov_b32_e32 v5, 2
@@ -1605,9 +1620,10 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_f16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x4400
; HSA-NEXT: s_mov_b32 s32, 0
@@ -1689,9 +1705,10 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 4.0
; HSA-NEXT: s_mov_b32 s32, 0
@@ -1776,9 +1793,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
@@ -1868,9 +1886,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
@@ -1968,9 +1987,10 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v5f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
@@ -2059,9 +2079,10 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_f64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000
@@ -2154,9 +2175,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2f64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
@@ -2258,9 +2280,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3f64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
@@ -2360,9 +2383,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -2459,9 +2483,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -2559,9 +2584,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -2647,9 +2673,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3i16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x20001
; HSA-NEXT: v_mov_b32_e32 v1, 3
@@ -2737,9 +2764,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3f16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00
; HSA-NEXT: v_mov_b32_e32 v1, 0x4400
@@ -2838,9 +2866,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -2928,9 +2957,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v4i16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x20001
; HSA-NEXT: v_mov_b32_e32 v1, 0x40003
@@ -3028,9 +3058,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -3123,9 +3154,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -3210,9 +3242,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
@@ -3302,9 +3335,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; HSA-LABEL: test_call_external_void_func_v3i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: v_mov_b32_e32 v1, 4
@@ -3398,9 +3432,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; HSA-LABEL: test_call_external_void_func_v3i32_i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: v_mov_b32_e32 v1, 4
@@ -3496,9 +3531,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -3590,9 +3626,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v4i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
@@ -3691,9 +3728,10 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v5i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
@@ -3808,7 +3846,8 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -3915,9 +3954,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v8i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
@@ -4045,7 +4085,8 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -4194,7 +4235,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_getpc_b64 s[8:9]
@@ -4359,7 +4401,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -4470,9 +4513,10 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; HSA: ; %bb.0:
; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 42
; HSA-NEXT: s_mov_b32 s32, 0
@@ -4588,7 +4632,8 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; HSA-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -4704,9 +4749,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8
@@ -4879,9 +4925,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8
@@ -5089,10 +5136,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
-; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
@@ -5346,7 +5394,8 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; HSA-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; HSA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x80
; HSA-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
-; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
index efa412d6b0a32..8e2fca554e28c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
@@ -11,8 +11,9 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK-LABEL: known_x_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2
; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
@@ -31,8 +32,9 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK-LABEL: known_y_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0
; CHECK-NEXT: s_mov_b32 s32, 0
@@ -50,8 +52,9 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK-LABEL: known_z_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
; CHECK-NEXT: s_mov_b32 s32, 0
@@ -69,8 +72,9 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si
; CHECK-LABEL: known_yz_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: s_mov_b32 s32, 0
@@ -88,8 +92,9 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si
; CHECK-LABEL: known_xz_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1
; CHECK-NEXT: s_mov_b32 s32, 0
@@ -108,8 +113,9 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s
; CHECK-LABEL: known_xyz_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_mov_b32_e32 v31, 0
; CHECK-NEXT: s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 8e4baf4a8e5a4..6db5effdf04ed 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -7,12 +7,13 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
@@ -30,8 +31,9 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -54,8 +56,9 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
@@ -76,8 +79,9 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
@@ -103,7 +107,8 @@ define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index 98f60a76a9e82..42beb1c8ae256 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -68,10 +68,11 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
+; GFX803-NEXT: s_mov_b32 s2, -1
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
@@ -88,8 +89,9 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX900-LABEL: test_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_mov_b32 s2, -1
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX900-NEXT: s_mov_b32 s3, 0xe00000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
@@ -112,11 +114,12 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT: s_mov_b32 s2, -1
+; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: s_getpc_b64 s[16:17]
; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
@@ -148,10 +151,11 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
+; GFX803-NEXT: s_mov_b32 s2, -1
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
@@ -171,8 +175,9 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_mov_b32 s2, -1
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX900-NEXT: s_mov_b32 s3, 0xe00000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
@@ -199,10 +204,11 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
-; GFX1010-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT: s_mov_b32 s2, -1
+; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
-; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
@@ -311,10 +317,11 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
+; GFX803-NEXT: s_mov_b32 s2, -1
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
@@ -332,8 +339,9 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_mov_b32 s2, -1
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX900-NEXT: s_mov_b32 s3, 0xe00000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
@@ -358,11 +366,12 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT: s_mov_b32 s2, -1
+; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: s_getpc_b64 s[16:17]
; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
@@ -413,10 +422,11 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
+; GFX803-NEXT: s_mov_b32 s2, -1
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 s33, 0
@@ -437,8 +447,9 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_mov_b32 s2, -1
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX900-NEXT: s_mov_b32 s3, 0xe00000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_mov_b32 s33, 0
@@ -467,10 +478,11 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
-; GFX1010-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT: s_mov_b32 s2, -1
+; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
-; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 408d3b2c4c229..68c632a0bf6f4 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -180,7 +180,8 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -229,7 +230,8 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index 955c0605f0536..ceeea64e2f9ea 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -22,9 +22,10 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; CHECK-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_and_b32 s4, 1, s7
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: s_cmp_eq_u32 s4, 1
; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: s_cselect_b32 s5, s13, s11
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 594f3b33bfb36..2a8c009062228 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -12,7 +12,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
@@ -37,7 +38,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GISEL-NEXT: s_add_i32 s12, s12, s17
; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000
; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
; GISEL-NEXT: s_mov_b32 s13, s15
; GISEL-NEXT: s_mov_b32 s12, s14
@@ -67,7 +69,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
@@ -93,7 +96,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GISEL-NEXT: s_add_i32 s12, s12, s17
; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000
; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
; GISEL-NEXT: s_mov_b32 s13, s15
; GISEL-NEXT: s_mov_b32 s12, s14
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index 4944a6db486ba..8843efd2c3c79 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -11,7 +11,8 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: s_mov_b32 s33, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 96dfb1fbf8017..4851c4f73456a 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -118,8 +118,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_add_u32 s8, s4, 8
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
@@ -177,8 +178,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_add_u32 s8, s4, 8
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
@@ -236,8 +238,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_add_u32 s8, s4, 8
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
@@ -295,8 +298,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_add_u32 s8, s4, 8
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
@@ -349,7 +353,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 0
; CHECK-NEXT: ds_write_b16 v0, v1
@@ -375,8 +380,9 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -418,7 +424,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 2
; CHECK-NEXT: ds_write_b16 v0, v1
@@ -444,8 +451,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -487,7 +495,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 1
; CHECK-NEXT: ds_write_b16 v0, v1
@@ -513,8 +522,9 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -556,7 +566,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 3
; CHECK-NEXT: ds_write_b16 v0, v1
@@ -582,8 +593,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 5c33d50382174..26271a0a68652 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -45,7 +45,8 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_getpc_b64 s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index 34ca2793997a4..b4a09810b0404 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -164,7 +164,8 @@ define amdgpu_kernel void @k01() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -198,7 +199,8 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -240,8 +242,9 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 9029e433700f5..46a2c6a14f695 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -229,7 +229,8 @@ define amdgpu_kernel void @k01() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -268,7 +269,8 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -310,7 +312,8 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index e88b76b30197c..1f5abe363d70e 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -49,9 +49,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_add_u32 s42, s34, 40
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_addc_u32 s43, s35, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
@@ -771,9 +772,10 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_add_u32 s42, s36, 40
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_addc_u32 s43, s37, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index 46b69537798e5..70a9bbbd47a3e 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -69,7 +69,8 @@ define amdgpu_kernel void @kernel_call() {
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -128,7 +129,8 @@ define amdgpu_kernel void @kernel_tailcall() {
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -240,7 +242,8 @@ define protected amdgpu_kernel void @kernel() {
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 92fbbdb9c4e87..fa2cefc6bd3e0 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -45,8 +45,8 @@ define amdgpu_kernel void @test_simple_indirect_call() {
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: s_mov_b64 s[2:3], 0xf0000000
-; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NEXT: s_mul_i32 s4, s4, s5
@@ -55,8 +55,9 @@ define amdgpu_kernel void @test_simple_indirect_call() {
; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12
; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: ds_write_b64 v0, v[3:4]
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 9510b400d1d6c..ecb4dbf5dfac9 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -75,8 +75,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_load_dwordx2 s[72:73], s[6:7], 0x0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1
+; GLOBALNESS1-NEXT: s_mov_b32 s2, -1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3
-; GLOBALNESS1-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GLOBALNESS1-NEXT: s_mov_b32 s3, 0xe00000
; GLOBALNESS1-NEXT: s_mov_b64 s[0:1], flat_scratch
; GLOBALNESS1-NEXT: s_mov_b32 s68, s14
; GLOBALNESS1-NEXT: s_mov_b32 s69, s13
@@ -357,8 +358,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_load_dwordx2 s[72:73], s[6:7], 0x0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1
+; GLOBALNESS0-NEXT: s_mov_b32 s2, -1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3
-; GLOBALNESS0-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; GLOBALNESS0-NEXT: s_mov_b32 s3, 0xe00000
; GLOBALNESS0-NEXT: s_mov_b64 s[0:1], flat_scratch
; GLOBALNESS0-NEXT: s_mov_b32 s66, s14
; GLOBALNESS0-NEXT: s_mov_b32 s67, s13
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
index c8ca99926eab4..7d759089a7c0c 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
@@ -14,7 +14,8 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id
; CHECK-NEXT: s_addc_u32 s11, s11, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
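For reference, the updated CHECK lines above replace the single 64-bit move of the combined rsrc words 2-3 with two 32-bit moves, one per word. A minimal sketch of that split in C++, assuming the packing used by getScratchRsrcWords23() (word 2 in the low half, word 3 in the high half) and the gfx9 value seen in these tests; the word-3 constant is target-dependent (0x31c16000 appears for other configurations above):

#include <cassert>
#include <cstdint>

int main() {
  // Assumed packed words 2-3 of the scratch V#, from the gfx9 CHECK lines.
  uint64_t Rsrc23 = 0x00e00000ffffffffULL;
  uint32_t Word2 = uint32_t(Rsrc23 & 0xffffffff); // s_mov_b32 s2, -1
  uint32_t Word3 = uint32_t(Rsrc23 >> 32);        // s_mov_b32 s3, 0xe00000
  assert(Word2 == 0xffffffffu && Word3 == 0x00e00000u);
  return 0;
}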
>From 50312359081a87a646c1cbd96f82edcfd946584b Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev at amd.com>
Date: Fri, 2 Feb 2024 19:49:58 +0100
Subject: [PATCH 6/9] [AMDGPU] Compiler should synthesize private buffer
resource descriptor from flat_scratch_init
---
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 49 ++++++++++++----------
1 file changed, 26 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index f5c3efecd7316..f12b83403e118 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -378,6 +378,7 @@ class PrologEpilogSGPRSpillBuilder {
} // namespace llvm
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
+// and return the FlatScratchInit register used.
Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
@@ -420,10 +421,22 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
break;
}
}
- assert(FlatScratchInitReg && "Failed to find free register for scratch init");
- FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
- FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+ } else {
+ FlatScratchInitReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.addLiveIn(FlatScratchInitReg);
+ MBB.addLiveIn(FlatScratchInitReg);
+ }
+
+ assert(FlatScratchInitReg && "Failed to find free register for scratch init");
+
+ FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+
+ if (ST.isAmdPalOS()) {
buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg);
@@ -449,20 +462,9 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
// Mask the offset in [47:0] of the descriptor
const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
- .addReg(FlatScrInitHi)
- .addImm(0xffff);
+ .addReg(FlatScrInitHi)
+ .addImm(0xffff);
And->getOperand(3).setIsDead(); // Mark SCC as dead.
- } else {
- FlatScratchInitReg =
- MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
- assert(FlatScratchInitReg);
-
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MRI.addLiveIn(FlatScratchInitReg);
- MBB.addLiveIn(FlatScratchInitReg);
-
- FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
- FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
}
// Do a 64-bit pointer add.
@@ -488,10 +490,11 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
return FlatScratchInitReg;
}
- // For GFX9.
+ assert(ST.getGeneration() == AMDGPUSubtarget::GFX9);
+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
- .addReg(FlatScrInitLo)
- .addReg(ScratchWaveOffsetReg);
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
AMDGPU::FLAT_SCR_HI)
.addReg(FlatScrInitHi)
@@ -836,18 +839,18 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
if (FlatScratchInit) {
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
- Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
- Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+ Register Lo_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ Register Hi_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
.addReg(FlatScratchInit)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
- BuildMI(MBB, I, DL, SMovB32, Rsrc2)
+ BuildMI(MBB, I, DL, SMovB32, Lo_32)
.addImm(Rsrc23 & 0xffffffff)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
- BuildMI(MBB, I, DL, SMovB32, Rsrc3)
+ BuildMI(MBB, I, DL, SMovB32, Hi_32)
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
return;
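As an aside, the GFX9 path in the hunk above forms flat_scratch with an S_ADD_U32/S_ADDC_U32 pair. A minimal sketch of that carry chain in plain C++ (the names are illustrative, not from the patch):

#include <cstdint>

// 64-bit add done as two 32-bit halves: S_ADD_U32 sets SCC on carry-out,
// and S_ADDC_U32 consumes SCC for the high half.
uint64_t addWaveOffset(uint32_t FlatScrLo, uint32_t FlatScrHi,
                       uint32_t WaveOffset) {
  uint32_t Lo = FlatScrLo + WaveOffset;
  uint32_t Carry = Lo < FlatScrLo ? 1u : 0u; // models SCC
  uint32_t Hi = FlatScrHi + Carry;
  return (uint64_t(Hi) << 32) | Lo;
}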
>From de3555e7d6ccc67c535a2ca56a930605f6de8465 Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev at amd.com>
Date: Fri, 2 Feb 2024 19:49:58 +0100
Subject: [PATCH 7/9] [AMDGPU] Compiler should synthesize private buffer
resource descriptor from flat_scratch_init
---
llvm/docs/AMDGPUUsage.rst | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5fcf651046943..a1a494e9df723 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5468,9 +5468,13 @@ If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies
Instead the flat SCRATCH instructions are used.
Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs
-that are used as a V# to access scratch. CP uses the value provided by the
-runtime. It is used, together with Scratch Wavefront Offset as an offset, to
-access the private memory space using a segment address. See
+that are used as a V# to access scratch.
+FIXME: The compiler synthetizes the initialization value for the Private Segment
+Buffer in the kernel prologue, using the Flat Scratch Init to initialize the
+low 64 bits and a known constant for the high 64 bits. If the Flat Scratch
+Init is not available, CP uses the value provided by the runtime. It is used,
+together with Scratch Wavefront Offset as an offset, to access the private
+memory space using a segment address. See
:ref:`amdgpu-amdhsa-initial-kernel-execution-state`.
The scratch V# is a four-aligned SGPR and always selected for the kernel as
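To make the wording above concrete, here is a rough sketch of the synthesized descriptor, assuming the standard 128-bit buffer resource (V#) word layout; the field names are illustrative, and the authoritative words 2-3 come from getScratchRsrcWords23():

#include <cstdint>

// Illustrative only. Words 0-1 of the scratch V# come from the Flat
// Scratch Init value; words 2-3 are a known per-target constant.
struct ScratchRsrc {
  uint64_t BaseAddress; // words 0-1: low 64 bits, from Flat Scratch Init
  uint32_t Word2;       // word 2: 0xffffffff in the updated tests
  uint32_t Word3;       // word 3: e.g. 0xe00000 on gfx9
};

ScratchRsrc synthesizeRsrc(uint64_t FlatScratchInit, uint64_t Rsrc23) {
  return {FlatScratchInit, uint32_t(Rsrc23 & 0xffffffff),
          uint32_t(Rsrc23 >> 32)};
}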
>From 3b3651691b34ee0a9cb24d4b270bf6c60084807d Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev at amd.com>
Date: Fri, 2 Feb 2024 19:49:58 +0100
Subject: [PATCH 8/9] [AMDGPU] Compiler should synthesize private buffer
resource descriptor from flat_scratch_init
---
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index f12b83403e118..8152596c63798 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -704,7 +704,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
-
if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
>From 87408092c6516564c14ea72b2166352b0fa10f28 Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev at amd.com>
Date: Fri, 2 Feb 2024 19:49:58 +0100
Subject: [PATCH 9/9] [AMDGPU] Compiler should synthesize private buffer
resource descriptor from flat_scratch_init
---
llvm/docs/AMDGPUUsage.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index a1a494e9df723..6298211f7bc95 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5469,7 +5469,7 @@ Instead the flat SCRATCH instructions are used.
Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs
that are used as a V# to access scratch.
-FIXME: The compiler synthetizes the initialization value for the Private Segment
+The compiler synthesizes the initialization value for the Private Segment
Buffer in the kernel prologue, using the Flat Scratch Init to initialize the
low 64 bits and a known constant for the high 64 bits. If the Flat Scratch
Init is not available, CP uses the value provided by the runtime. It is used,