[llvm] 88e5251 - [AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init (#79586)
via llvm-commits
llvm-commits@lists.llvm.org
Thu Feb 8 11:27:39 PST 2024
Author: alex-t
Date: 2024-02-08T20:27:36+01:00
New Revision: 88e52511ca71165f1ff3d7c42229aeacb2c16db3
URL: https://github.com/llvm/llvm-project/commit/88e52511ca71165f1ff3d7c42229aeacb2c16db3
DIFF: https://github.com/llvm/llvm-project/commit/88e52511ca71165f1ff3d7c42229aeacb2c16db3.diff
LOG: [AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init (#79586)
This change synthesizes the private buffer resource descriptor in the
kernel prologue from the preloaded flat scratch init value instead of
using the preloaded kernel argument.
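For readers skimming the diff below: on a gfx9-style target the four-dword
scratch buffer descriptor (V#) is now built from flat_scratch plus known
constants rather than copied from the preloaded PRIVATE_SEGMENT_BUFFER
argument. A minimal standalone C++ sketch of the resulting descriptor layout
(illustrative only, not LLVM code; the word[2]/word[3] constants mirror the
gfx9 values visible in the updated tests, e.g. "s_mov_b32 s2, -1" and
"s_mov_b32 s3, 0xe00000", and differ per subtarget):

#include <cstdint>
#include <cstdio>

// Sketch: assemble the four 32-bit words of the private buffer resource
// descriptor (V#) the way the new kernel prologue does. The low 64 bits
// come from the flat scratch base; the high two dwords are
// subtarget-dependent constants (hypothetical helper for illustration).
struct PrivateBufferRsrc {
  uint32_t word[4];
};

PrivateBufferRsrc synthesizeRsrc(uint64_t FlatScratchBase) {
  PrivateBufferRsrc Rsrc;
  Rsrc.word[0] = uint32_t(FlatScratchBase);        // s0 <- flat_scratch lo
  Rsrc.word[1] = uint32_t(FlatScratchBase >> 32);  // s1 <- flat_scratch hi
  Rsrc.word[2] = 0xffffffffu;                      // s2: NUM_RECORDS = -1
  Rsrc.word[3] = 0x00e00000u;                      // s3: gfx9 rsrc word 3
  return Rsrc;
}

int main() {
  PrivateBufferRsrc R = synthesizeRsrc(0x0000123456780000ull);
  for (int i = 0; i < 4; ++i)
    std::printf("s%d = 0x%08x\n", i, (unsigned)R.word[i]);
  return 0;
}

When flat scratch init is unavailable, the prologue falls back to the
runtime-provided descriptor, as before.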
Added:
Modified:
llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
llvm/lib/Target/AMDGPU/SIFrameLowering.h
llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
llvm/test/CodeGen/AMDGPU/call-argument-types.ll
llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
llvm/test/CodeGen/AMDGPU/cc-update.ll
llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
llvm/test/CodeGen/AMDGPU/indirect-call.ll
llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
Removed:
################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 6b2417143ca06..301996847a584 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5530,9 +5530,13 @@ If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies
Instead the flat SCRATCH instructions are used.
Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs
-that are used as a V# to access scratch. CP uses the value provided by the
-runtime. It is used, together with Scratch Wavefront Offset as an offset, to
-access the private memory space using a segment address. See
+that are used as a V# to access scratch.
+The compiler synthesizes the initialization value for the Private Segment
+Buffer in the kernel prologue, using the Flat Scratch Init to initialize the
+low 64 bits and a known constant for the high 32 bits. If the Flat Scratch
+Init is not available, CP uses the value provided by the runtime. It is used,
+together with Scratch Wavefront Offset as an offset, to access the private
+memory space using a segment address. See
:ref:`amdgpu-amdhsa-initial-kernel-execution-state`.
The scratch V# is a four-aligned SGPR and always selected for the kernel as
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index d02aee71870ec..6327a818a12ed 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -379,7 +379,8 @@ class PrologEpilogSGPRSpillBuilder {
} // namespace llvm
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
-void SIFrameLowering::emitEntryFunctionFlatScratchInit(
+// and return the FlatScratchInit register used.
+Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -399,6 +400,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
Register FlatScrInitLo;
Register FlatScrInitHi;
+ Register FlatScratchInitReg;
if (ST.isAmdPalOS()) {
// Extract the scratch offset from the descriptor in the GIT
@@ -408,7 +410,6 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// Find unused reg to load flat scratch init into
MachineRegisterInfo &MRI = MF.getRegInfo();
- Register FlatScrInit = AMDGPU::NoRegister;
ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
AllSGPR64s = AllSGPR64s.slice(
@@ -417,16 +418,28 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
for (MCPhysReg Reg : AllSGPR64s) {
if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
- FlatScrInit = Reg;
+ FlatScratchInitReg = Reg;
break;
}
}
- assert(FlatScrInit && "Failed to find free register for scratch init");
- FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
- FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
+ } else {
+ FlatScratchInitReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.addLiveIn(FlatScratchInitReg);
+ MBB.addLiveIn(FlatScratchInitReg);
+ }
+
+ assert(FlatScratchInitReg && "Failed to find free register for scratch init");
+
+ FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+
+ if (ST.isAmdPalOS()) {
- buildGitPtr(MBB, I, DL, TII, FlatScrInit);
+ buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg);
// We now have the GIT ptr - now get the scratch descriptor from the entry
// at offset 0 (or offset 16 for a compute shader).
@@ -441,8 +454,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
- BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
- .addReg(FlatScrInit)
+ BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg)
+ .addReg(FlatScratchInitReg)
.addImm(EncodedOffset) // offset
.addImm(0) // cpol
.addMemOperand(MMO);
@@ -450,20 +463,9 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// Mask the offset in [47:0] of the descriptor
const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
- .addReg(FlatScrInitHi)
- .addImm(0xffff);
+ .addReg(FlatScrInitHi)
+ .addImm(0xffff);
And->getOperand(3).setIsDead(); // Mark SCC as dead.
- } else {
- Register FlatScratchInitReg =
- MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
- assert(FlatScratchInitReg);
-
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MRI.addLiveIn(FlatScratchInitReg);
- MBB.addLiveIn(FlatScratchInitReg);
-
- FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
- FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
}
// Do a 64-bit pointer add.
@@ -486,20 +488,21 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
addReg(FlatScrInitHi).
addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
(31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
- return;
+ return FlatScratchInitReg;
}
- // For GFX9.
+ assert(ST.getGeneration() == AMDGPUSubtarget::GFX9);
+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
- .addReg(FlatScrInitLo)
- .addReg(ScratchWaveOffsetReg);
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
AMDGPU::FLAT_SCR_HI)
.addReg(FlatScrInitHi)
.addImm(0);
Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
- return;
+ return AMDGPU::FLAT_SCR;
}
assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
@@ -520,6 +523,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addReg(FlatScrInitLo, RegState::Kill)
.addImm(8);
LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
+ return AMDGPU::FLAT_SCR;
}
// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
@@ -611,11 +615,15 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const Function &F = MF.getFunction();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
assert(MFI->isEntryFunction());
+ bool NeedsFlatScratchInit =
+ MFI->getUserSGPRInfo().hasFlatScratchInit() &&
+ (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
+ (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+
Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
@@ -641,7 +649,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// Now that we have fixed the reserved SRSRC we need to locate the
// (potentially) preloaded SRSRC.
Register PreloadedScratchRsrcReg;
- if (ST.isAmdHsaOrMesa(F)) {
+ if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) {
PreloadedScratchRsrcReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
@@ -697,33 +705,30 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
- bool NeedsFlatScratchInit =
- MFI->getUserSGPRInfo().hasFlatScratchInit() &&
- (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
- (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
-
if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
+ Register FlatScratchInit;
if (NeedsFlatScratchInit) {
- emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
+ FlatScratchInit =
+ emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
}
if (ScratchRsrcReg) {
- emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
- PreloadedScratchRsrcReg,
- ScratchRsrcReg, ScratchWaveOffsetReg);
+ emitEntryFunctionScratchRsrcRegSetup(
+ MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg,
+ PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
}
}
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- const DebugLoc &DL, Register PreloadedScratchRsrcReg,
- Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
+ const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg,
+ Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -771,7 +776,8 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(21)
.addReg(Rsrc03);
}
- } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
+ } else if (ST.isMesaGfxShader(Fn) ||
+ (!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) {
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -830,6 +836,26 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else if (ST.isAmdHsaOrMesa(Fn)) {
+
+ if (FlatScratchInit) {
+ const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+ Register Lo_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ Register Hi_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+ I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
+ TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
+ .addReg(FlatScratchInit)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ BuildMI(MBB, I, DL, SMovB32, Lo_32)
+ .addImm(Rsrc23 & 0xffffffff)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Hi_32)
+ .addImm(Rsrc23 >> 32)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ return;
+ }
+
assert(PreloadedScratchRsrcReg);
if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index b3feb759ed811..f706d48b2dc10 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -67,19 +67,19 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
MachineBasicBlock::iterator MI) const override;
private:
- void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const DebugLoc &DL,
- Register ScratchWaveOffsetReg) const;
+ Register
+ emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register ScratchWaveOffsetReg) const;
Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
void emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
- Register ScratchWaveOffsetReg) const;
+ Register FlatScratchInit, Register ScratchRsrcReg,
+ Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;
public:
bool hasFP(const MachineFunction &MF) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index e597ce6f114a6..6e49a5a4ec0e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -13,10 +13,11 @@ define amdgpu_kernel void @kernel_caller_stack() {
; MUBUF-LABEL: kernel_caller_stack:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; MUBUF-NEXT: s_mov_b32 s2, -1
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_add_u32 s0, s0, s7
+; MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; MUBUF-NEXT: s_mov_b32 s32, 0
-; MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
@@ -61,9 +62,10 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-LABEL: kernel_caller_byval:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; MUBUF-NEXT: s_mov_b32 s2, -1
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_add_u32 s0, s0, s7
-; MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; MUBUF-NEXT: s_mov_b32 s3, 0xe00000
+; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index a439c0f51ffe9..609b5e6f49ef1 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -48,19 +48,20 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-SDAG: ; %bb.0:
; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9
-; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-SDAG-NEXT: s_mov_b32 s2, -1
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-SDAG-NEXT: s_mov_b32 s3, 0x11e80000
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-SDAG-NEXT: s_mov_b64 s[0:1], flat_scratch
; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2
; FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6
; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7
; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5]
; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
@@ -70,19 +71,20 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-GISEL: ; %bb.0:
; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9
-; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-GISEL-NEXT: s_mov_b32 s2, -1
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-GISEL-NEXT: s_mov_b32 s3, 0x11e80000
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1
; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6
; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7
; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5]
; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index 7c8d40c49bb80..74c6bb599cb9b 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -10,8 +10,9 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 5a128c7541d1e..c06f213b9eb66 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -5,13 +5,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-LABEL: name: f1
; GFX90A: bb.0.bb:
; GFX90A-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11
+ ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0
; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc
; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc
- ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr2 = S_MOV_B32 4294967295, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr3 = S_MOV_B32 14680064, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 87e17a1c82080..381fb9891e538 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -129,12 +129,13 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; HSA-LABEL: test_call_external_void_func_i1_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
@@ -234,8 +235,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
@@ -339,8 +341,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
@@ -422,12 +425,13 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; HSA-LABEL: test_call_external_void_func_i8_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
@@ -525,8 +529,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
@@ -625,8 +630,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
@@ -707,12 +713,13 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_i16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
@@ -809,8 +816,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_sshort v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
@@ -909,8 +917,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
@@ -991,12 +1000,13 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; HSA-LABEL: test_call_external_void_func_i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 42
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
@@ -1078,13 +1088,14 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_i64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: v_mov_b32_e32 v1, 0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
@@ -1182,12 +1193,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_mov_b32 s4, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_mov_b32 s5, s4
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
@@ -1278,15 +1290,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2i64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: v_mov_b32_e32 v2, 3
; HSA-NEXT: v_mov_b32_e32 v3, 4
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
@@ -1391,12 +1404,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_mov_b32 s4, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_mov_b32 s5, s4
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v4, 1
; HSA-NEXT: v_mov_b32_e32 v5, 2
; HSA-NEXT: s_mov_b32 s32, 0
@@ -1514,12 +1528,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_mov_b32 s4, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_mov_b32 s5, s4
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v4, 1
; HSA-NEXT: v_mov_b32_e32 v5, 2
; HSA-NEXT: v_mov_b32_e32 v6, 3
@@ -1605,12 +1620,13 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_f16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x4400
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
@@ -1689,12 +1705,13 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 4.0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12
@@ -1776,13 +1793,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
@@ -1868,14 +1886,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: v_mov_b32_e32 v2, 4.0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
@@ -1968,16 +1987,17 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v5f32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: v_mov_b32_e32 v2, 4.0
; HSA-NEXT: v_mov_b32_e32 v3, -1.0
; HSA-NEXT: v_mov_b32_e32 v4, 0.5
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
@@ -2059,13 +2079,14 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_f64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12
@@ -2154,15 +2175,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2f64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: v_mov_b32_e32 v2, 0
; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12
@@ -2258,9 +2280,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3f64_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: v_mov_b32_e32 v2, 0
@@ -2268,7 +2292,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; HSA-NEXT: v_mov_b32_e32 v4, 0
; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12
@@ -2357,14 +2380,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; HSA-LABEL: test_call_external_void_func_v2i16:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
@@ -2456,14 +2480,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; HSA-LABEL: test_call_external_void_func_v3i16:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
@@ -2556,14 +2581,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; HSA-LABEL: test_call_external_void_func_v3f16:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
@@ -2647,13 +2673,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3i16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x20001
; HSA-NEXT: v_mov_b32_e32 v1, 3
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
@@ -2737,13 +2764,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v3f16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00
; HSA-NEXT: v_mov_b32_e32 v1, 0x4400
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
@@ -2835,14 +2863,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; HSA-LABEL: test_call_external_void_func_v4i16:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
@@ -2928,13 +2957,14 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v4i16_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x20001
; HSA-NEXT: v_mov_b32_e32 v1, 0x40003
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
@@ -3025,14 +3055,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; HSA-LABEL: test_call_external_void_func_v2f16:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
@@ -3120,14 +3151,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; HSA-LABEL: test_call_external_void_func_v2i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
@@ -3210,13 +3242,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v2i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
@@ -3302,14 +3335,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; HSA-LABEL: test_call_external_void_func_v3i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: v_mov_b32_e32 v1, 4
; HSA-NEXT: v_mov_b32_e32 v2, 5
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
@@ -3398,15 +3432,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; HSA-LABEL: test_call_external_void_func_v3i32_i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: v_mov_b32_e32 v1, 4
; HSA-NEXT: v_mov_b32_e32 v2, 5
; HSA-NEXT: v_mov_b32_e32 v3, 6
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
@@ -3493,14 +3528,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; HSA-LABEL: test_call_external_void_func_v4i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 s2, -1
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
@@ -3590,15 +3626,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v4i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: v_mov_b32_e32 v2, 3
; HSA-NEXT: v_mov_b32_e32 v3, 4
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
@@ -3691,16 +3728,17 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v5i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: v_mov_b32_e32 v2, 3
; HSA-NEXT: v_mov_b32_e32 v3, 4
; HSA-NEXT: v_mov_b32_e32 v4, 5
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
@@ -3803,13 +3841,14 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
@@ -3915,9 +3954,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; HSA-LABEL: test_call_external_void_func_v8i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: v_mov_b32_e32 v2, 3
@@ -3927,7 +3968,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; HSA-NEXT: v_mov_b32_e32 v6, 7
; HSA-NEXT: v_mov_b32_e32 v7, 8
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
@@ -4038,7 +4078,6 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4046,7 +4085,9 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
@@ -4183,7 +4224,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4195,8 +4235,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
@@ -4359,9 +4401,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; HSA-NEXT: s_add_u32 s0, s0, s9
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
@@ -4466,14 +4509,15 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
;
; HSA-LABEL: test_call_external_i32_func_i32_imm:
; HSA: ; %bb.0:
-; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0
+; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 42
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_mov_b32 s39, 0x1100f000
; HSA-NEXT: s_mov_b32 s38, -1
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -4581,13 +4625,14 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; HSA-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
@@ -4702,9 +4747,11 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8
; HSA-NEXT: v_mov_b32_e32 v0, 8
@@ -4712,7 +4759,6 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
; HSA-NEXT: s_movk_i32 s32, 0x400
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
@@ -4877,9 +4923,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8
; HSA-NEXT: v_mov_b32_e32 v0, 8
@@ -4887,7 +4935,6 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
; HSA-NEXT: s_movk_i32 s32, 0x800
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
@@ -5085,12 +5132,13 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
@@ -5339,14 +5387,15 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64:
; HSA: ; %bb.0: ; %entry
; HSA-NEXT: s_add_i32 s6, s6, s9
-; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT: s_add_u32 s0, s0, s9
+; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; HSA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x80
; HSA-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; HSA-NEXT: s_mov_b32 s2, -1
+; HSA-NEXT: s_mov_b32 s3, 0x11e80000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: v_mov_b32_e32 v0, s23
; HSA-NEXT: v_mov_b32_e32 v1, s6
diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
index c62a082459105..8e2fca554e28c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
@@ -11,10 +11,11 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK-LABEL: known_x_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -31,9 +32,10 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK-LABEL: known_y_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -50,9 +52,10 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK-LABEL: known_z_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -69,9 +72,10 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si
; CHECK-LABEL: known_yz_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -88,9 +92,10 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si
; CHECK-LABEL: known_xz_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -108,9 +113,10 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s
; CHECK-LABEL: known_xyz_0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: v_mov_b32_e32 v31, 0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 616e5f00fc1e5..6db5effdf04ed 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -7,12 +7,13 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
@@ -30,10 +31,11 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v0, v0, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
@@ -52,11 +54,12 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -74,11 +77,12 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #
define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call_return_val:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -99,12 +103,13 @@ define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 6d603ef039769..49bf48a3687c9 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -165,7 +165,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
; FIXEDABI-NOT: v1
; FIXEDABI-NOT: v2
; FIXEDABI: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1
+; FIXEDABI: v_or_b32_e32 v31, v0, v1
; FIXEDABI-NOT: v0
; FIXEDABI-NOT: v1
; FIXEDABI-NOT: v2
@@ -181,7 +181,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
; FIXEDABI-NOT: v1
; FIXEDABI-NOT: v2
; FIXEDABI: v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1
+; FIXEDABI: v_or_b32_e32 v31, v0, v1
; FIXEDABI-NOT: v0
; FIXEDABI-NOT: v1
; FIXEDABI-NOT: v2
@@ -198,7 +198,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
; FIXEDABI-NOT: v2
; FIXEDABI: v_lshlrev_b32_e32 v0, 20, v2
; FIXEDABI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-NEXT: v_or_b32_e32 v31, v1, v0
+; FIXEDABI: v_or_b32_e32 v31, v1, v0
; FIXEDABI-NOT: v0
; FIXEDABI-NOT: v1
; FIXEDABI-NOT: v2
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index ca09163b20afc..42beb1c8ae256 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -68,13 +68,14 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b32 s2, -1
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s15
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
+; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -88,11 +89,12 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX900-LABEL: test_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_mov_b32 s2, -1
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s15
+; GFX900-NEXT: s_mov_b32 s3, 0xe00000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
+; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -112,11 +114,12 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT: s_add_u32 s0, s0, s15
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
+; GFX1010-NEXT: s_mov_b32 s2, -1
+; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: s_getpc_b64 s[16:17]
; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
@@ -148,13 +151,14 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b32 s2, -1
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s15
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
+; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
@@ -171,11 +175,12 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_mov_b32 s2, -1
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s15
+; GFX900-NEXT: s_mov_b32 s3, 0xe00000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
+; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
@@ -199,10 +204,11 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
-; GFX1010-NEXT: s_add_u32 s0, s0, s15
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GFX1010-NEXT: s_mov_b32 s2, -1
+; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
@@ -311,13 +317,14 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b32 s2, -1
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s15
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
+; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -332,11 +339,12 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_mov_b32 s2, -1
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s15
+; GFX900-NEXT: s_mov_b32 s3, 0xe00000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
+; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -358,11 +366,12 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT: s_add_u32 s0, s0, s15
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
+; GFX1010-NEXT: s_mov_b32 s2, -1
+; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: s_getpc_b64 s[16:17]
; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
@@ -413,14 +422,15 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b32 s2, -1
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_add_u32 s0, s0, s15
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 s33, 0
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
+; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
@@ -437,12 +447,13 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_mov_b32 s2, -1
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s15
+; GFX900-NEXT: s_mov_b32 s3, 0xe00000
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_mov_b32 s33, 0
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
+; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
@@ -467,10 +478,11 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
-; GFX1010-NEXT: s_add_u32 s0, s0, s15
-; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GFX1010-NEXT: s_mov_b32 s2, -1
+; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 11871db1ef656..68c632a0bf6f4 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -180,8 +180,9 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT: s_add_u32 s0, s0, s15
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s8, 0
@@ -229,8 +230,9 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT: s_add_u32 s0, s0, s15
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xe00000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s8, 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index 47110d9491887..2d019efe2417a 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -13,8 +13,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; GFX9-NEXT: s_add_u32 s0, s0, s7
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0
; GFX9-NEXT: s_getpc_b64 s[4:5]
@@ -25,14 +23,17 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s4, 1, s7
+; GFX9-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-NEXT: s_cmp_eq_u32 s4, 1
-; GFX9-NEXT: v_mov_b32_e32 v31, v0
+; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX9-NEXT: s_cselect_b32 s5, s13, s11
; GFX9-NEXT: s_cselect_b32 s4, s12, s10
+; GFX9-NEXT: s_mov_b64 s[8:9], 0
; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 408199bbc9223..a66ed939fef60 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -12,8 +12,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_add_u32 s0, s0, s17
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_getpc_b64 s[14:15]
@@ -37,8 +38,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GISEL-NEXT: s_add_i32 s12, s12, s17
; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT: s_add_u32 s0, s0, s17
-; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000
+; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
; GISEL-NEXT: s_mov_b32 s13, s15
; GISEL-NEXT: s_mov_b32 s12, s14
; GISEL-NEXT: s_getpc_b64 s[14:15]
@@ -67,8 +69,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_add_u32 s0, s0, s17
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_getpc_b64 s[14:15]
@@ -93,8 +96,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GISEL-NEXT: s_add_i32 s12, s12, s17
; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT: s_add_u32 s0, s0, s17
-; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000
+; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
; GISEL-NEXT: s_mov_b32 s13, s15
; GISEL-NEXT: s_mov_b32 s12, s14
; GISEL-NEXT: s_getpc_b64 s[14:15]
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index 6e905542ce53c..8843efd2c3c79 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -11,8 +11,9 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: s_mov_b32 s33, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: v_mov_b32_e32 v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 66f31bbf7afe0..4851c4f73456a 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -118,10 +118,11 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_add_u32 s8, s4, 8
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -177,10 +178,11 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_add_u32 s8, s4, 8
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -236,10 +238,11 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_add_u32 s8, s4, 8
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -295,10 +298,11 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_add_u32 s8, s4, 8
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_addc_u32 s9, s5, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -341,8 +345,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -351,6 +353,9 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 0
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -370,14 +375,15 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -410,8 +416,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -420,6 +424,9 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 2
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -439,14 +446,15 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -479,8 +487,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -489,6 +495,9 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 1
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -508,14 +517,15 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -548,8 +558,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -558,6 +566,9 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s15, 3
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -577,14 +588,15 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i
; CHECK-NEXT: s_addc_u32 s7, s7, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_add_u32 s0, s0, s9
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_add_u32 s8, s4, 8
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 61818dafd2b84..26271a0a68652 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -45,8 +45,9 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index bb7c43f76c8a1..f780188deaec1 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -164,8 +164,9 @@ define amdgpu_kernel void @k01() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4
@@ -198,8 +199,9 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4
@@ -240,8 +242,9 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 4d73436c519bd..fa4b93fd1d6b7 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -229,8 +229,9 @@ define amdgpu_kernel void @k01() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4
@@ -268,8 +269,9 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4
@@ -310,8 +312,9 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
; GCN-NEXT: s_add_i32 s6, s6, s9
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
+; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 138a6a86cee98..e17f311b11d8b 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -44,17 +44,18 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7]
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v40, v0
; CHECK-NEXT: s_add_u32 s42, s34, 40
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_addc_u32 s43, s35, 0
-; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s33, s14
; CHECK-NEXT: s_mov_b32 s40, s13
; CHECK-NEXT: s_mov_b32 s41, s12
@@ -781,17 +782,18 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx2 s[44:45], s[6:7], 0x10
-; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7]
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v40, v0
; CHECK-NEXT: s_add_u32 s42, s36, 40
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_addc_u32 s43, s37, 0
-; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s33, s14
; CHECK-NEXT: s_mov_b32 s40, s13
; CHECK-NEXT: s_mov_b32 s41, s12
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index f70441e87a74b..70a9bbbd47a3e 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -69,8 +69,9 @@ define amdgpu_kernel void @kernel_call() {
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[16:17]
@@ -128,8 +129,9 @@ define amdgpu_kernel void @kernel_tailcall() {
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[16:17]
@@ -240,8 +242,9 @@ define protected amdgpu_kernel void @kernel() {
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0xe00000
+; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
; CHECK-NEXT: s_getpc_b64 s[16:17]
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index e7c5aaf043efb..e6d9c0d6105f5 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -45,8 +45,8 @@ define amdgpu_kernel void @test_simple_indirect_call() {
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX9-NEXT: s_add_u32 s0, s0, s15
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NEXT: s_mul_i32 s4, s4, s5
@@ -55,8 +55,9 @@ define amdgpu_kernel void @test_simple_indirect_call() {
; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12
; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch
; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: ds_write_b64 v0, v[3:4]
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 1118cc3b16463..8d8459ff0b1b2 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -45,10 +45,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18
; GLOBALNESS1-NEXT: s_load_dword s7, s[38:39], 0x20
; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s10, s15
-; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s15
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
-; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400
; GLOBALNESS1-NEXT: s_bitcmp1_b32 s74, 0
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
@@ -73,7 +71,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1
+; GLOBALNESS1-NEXT: s_mov_b32 s2, -1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3
+; GLOBALNESS1-NEXT: s_mov_b32 s3, 0xe00000
+; GLOBALNESS1-NEXT: s_mov_b64 s[0:1], flat_scratch
; GLOBALNESS1-NEXT: s_mov_b32 s68, s14
; GLOBALNESS1-NEXT: s_mov_b32 s69, s13
; GLOBALNESS1-NEXT: s_mov_b32 s70, s12
@@ -332,10 +333,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18
; GLOBALNESS0-NEXT: s_load_dword s7, s[38:39], 0x20
; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s10, s15
-; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s15
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
-; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400
; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
@@ -360,7 +359,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1
+; GLOBALNESS0-NEXT: s_mov_b32 s2, -1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3
+; GLOBALNESS0-NEXT: s_mov_b32 s3, 0xe00000
+; GLOBALNESS0-NEXT: s_mov_b64 s[0:1], flat_scratch
; GLOBALNESS0-NEXT: s_mov_b32 s66, s14
; GLOBALNESS0-NEXT: s_mov_b32 s67, s13
; GLOBALNESS0-NEXT: s_mov_b32 s68, s12
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
index 7840559c78eb6..7d759089a7c0c 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
@@ -14,8 +14,9 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id
; CHECK-NEXT: s_addc_u32 s11, s11, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
; CHECK-NEXT: v_mov_b32_e32 v5, 42