[llvm] d5a4658 - [AMDGPU] Omit buffer resource with flat scratch.
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 9 08:20:55 PST 2020
Author: Stanislav Mekhanoshin
Date: 2020-11-09T08:05:20-08:00
New Revision: d5a465866eea7f1779869c679a4c25ac2cbae59e
URL: https://github.com/llvm/llvm-project/commit/d5a465866eea7f1779869c679a4c25ac2cbae59e
DIFF: https://github.com/llvm/llvm-project/commit/d5a465866eea7f1779869c679a4c25ac2cbae59e.diff
LOG: [AMDGPU] Omit buffer resource with flat scratch.
Differential Revision: https://reviews.llvm.org/D90979
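The pattern applied across the files below is to skip materializing and passing the private segment buffer resource descriptor (the SRD copied into SGPR0-SGPR3) whenever the subtarget accesses scratch through flat-scratch instructions rather than MUBUF; this also frees those SGPRs, which is why the FLATSCR test checks use s0/s1 where the MUBUF checks use s4/s5. Below is a minimal, self-contained sketch of that guard; the Subtarget and FunctionInfo types are simplified stand-ins for illustration, not the actual LLVM classes.

  #include <cstdio>
  #include <optional>

  // Simplified stand-ins for GCNSubtarget / SIMachineFunctionInfo; the real
  // classes live in llvm/lib/Target/AMDGPU and are not reproduced here.
  struct Subtarget {
    bool UseFlatScratch;
    bool enableFlatScratch() const { return UseFlatScratch; }
  };

  struct FunctionInfo {
    // SGPR quad holding the scratch buffer resource descriptor, if any.
    std::optional<unsigned> ScratchRSrcReg;
  };

  void reserveScratchResources(const Subtarget &ST, FunctionInfo &Info) {
    if (!ST.enableFlatScratch()) {
      // MUBUF scratch access needs the 128-bit resource descriptor, so a
      // four-SGPR tuple (SGPR0-SGPR3, modeled here as register 0) is set
      // aside and later copied into s[0:3] around calls.
      Info.ScratchRSrcReg = 0;
    }
    // With flat scratch enabled the descriptor is never reserved, so the
    // low SGPRs stay available for argument and scratch-offset use.
  }

  int main() {
    FunctionInfo Mubuf, FlatScratch;
    reserveScratchResources({/*UseFlatScratch=*/false}, Mubuf);
    reserveScratchResources({/*UseFlatScratch=*/true}, FlatScratch);
    std::printf("mubuf reserves SRD: %d, flat scratch reserves SRD: %d\n",
                Mubuf.ScratchRSrcReg.has_value(),
                FlatScratch.ScratchRSrcReg.has_value());
    return 0;
  }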
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
llvm/test/CodeGen/AMDGPU/flat-scratch.ll
llvm/test/CodeGen/AMDGPU/load-lo16.ll
llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index af58df181918..cdea537a09dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -939,7 +939,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (IsEntryFunc) {
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
} else {
- CCInfo.AllocateReg(Info->getScratchRSrcReg());
+ if (!Subtarget.enableFlatScratch())
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -1227,12 +1228,14 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- // Insert copies for the SRD. In the HSA case, this should be an identity
- // copy.
- auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
- MFI->getScratchRSrcReg());
- MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
- MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+ if (!ST.enableFlatScratch()) {
+ // Insert copies for the SRD. In the HSA case, this should be an identity
+ // copy.
+ auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
+ MFI->getScratchRSrcReg());
+ MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+ MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+ }
for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index f8cc31c0503a..1158f9360b03 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -467,7 +467,9 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
//
// This will return `Register()` in cases where there are no actual
// uses of the SRSRC.
- Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
+ Register ScratchRsrcReg;
+ if (!ST.enableFlatScratch())
+ ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
// Make the selected register live throughout the function.
if (ScratchRsrcReg) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7987ac72e451..217b6387f266 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2117,26 +2117,28 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
- if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
- // If we have stack objects, we unquestionably need the private buffer
- // resource. For the Code Object V2 ABI, this will be the first 4 user
- // SGPR inputs. We can reserve those and use them directly.
-
- Register PrivateSegmentBufferReg =
- Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
- Info.setScratchRSrcReg(PrivateSegmentBufferReg);
- } else {
- unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
- // We tentatively reserve the last registers (skipping the last registers
- // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
- // we'll replace these with the ones immediately after those which were
- // really allocated. In the prologue copies will be inserted from the
- // argument to these reserved registers.
+ if (!ST.enableFlatScratch()) {
+ if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
+ // If we have stack objects, we unquestionably need the private buffer
+ // resource. For the Code Object V2 ABI, this will be the first 4 user
+ // SGPR inputs. We can reserve those and use them directly.
+
+ Register PrivateSegmentBufferReg =
+ Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+ Info.setScratchRSrcReg(PrivateSegmentBufferReg);
+ } else {
+ unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
+ // We tentatively reserve the last registers (skipping the last registers
+ // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
+ // we'll replace these with the ones immediately after those which were
+ // really allocated. In the prologue copies will be inserted from the
+ // argument to these reserved registers.
- // Without HSA, relocations are used for the scratch pointer and the
- // buffer resource setup is always inserted in the prologue. Scratch wave
- // offset is still in an input SGPR.
- Info.setScratchRSrcReg(ReservedBufferReg);
+ // Without HSA, relocations are used for the scratch pointer and the
+ // buffer resource setup is always inserted in the prologue. Scratch wave
+ // offset is still in an input SGPR.
+ Info.setScratchRSrcReg(ReservedBufferReg);
+ }
}
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -3012,14 +3014,16 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!IsSibCall) {
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
- SmallVector<SDValue, 4> CopyFromChains;
+ if (!Subtarget->enableFlatScratch()) {
+ SmallVector<SDValue, 4> CopyFromChains;
- // In the HSA case, this should be an identity copy.
- SDValue ScratchRSrcReg
- = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
- RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
- CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
- Chain = DAG.getTokenFactor(DL, CopyFromChains);
+ // In the HSA case, this should be an identity copy.
+ SDValue ScratchRSrcReg
+ = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+ CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
+ Chain = DAG.getTokenFactor(DL, CopyFromChains);
+ }
}
MVT PtrVT = MVT::i32;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e6c4e9cd6610..8c10a971115f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1379,11 +1379,14 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
+ Register ScratchRSrc =
+ ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+ : MFI->getScratchRSrcReg();
BuildMI(MBB, MI, DL, OpDesc)
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
- .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+ .addReg(ScratchRSrc, RegState::Implicit)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
// Add the scratch resource registers as implicit uses because we may end up
// needing them, and need to ensure that the reserved registers are
@@ -1397,10 +1400,13 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
: getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
+ Register ScratchRSrc =
+ ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+ : MFI->getScratchRSrcReg();
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(ScratchRSrc) // scratch_rsrc
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
@@ -1513,21 +1519,27 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
+ Register ScratchRSrc =
+ ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+ : MFI->getScratchRSrcReg();
if (RI.spillSGPRToVGPR())
FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
- .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+ .addReg(ScratchRSrc, RegState::Implicit)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
return;
}
unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
: getVGPRSpillRestoreOpcode(SpillSize);
+ Register ScratchRSrc =
+ ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+ : MFI->getScratchRSrcReg();
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(ScratchRSrc) // scratch_rsrc
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 046dcf42a02a..8032bc5f9de9 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -75,16 +75,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
if (!isEntryFunction()) {
- // Non-entry functions have no special inputs for now, other registers
- // required for scratch access.
- ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
-
// TODO: Pick a high register, and shift down, similar to a kernel.
FrameOffsetReg = AMDGPU::SGPR33;
StackPtrOffsetReg = AMDGPU::SGPR32;
- ArgInfo.PrivateSegmentBuffer =
- ArgDescriptor::createRegister(ScratchRSrcReg);
+ if (!ST.enableFlatScratch()) {
+ // Non-entry functions have no special inputs for now, other registers
+ // required for scratch access.
+ ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+
+ ArgInfo.PrivateSegmentBuffer =
+ ArgDescriptor::createRegister(ScratchRSrcReg);
+ }
if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
@@ -142,7 +144,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
if (isAmdHsaOrMesa) {
- PrivateSegmentBuffer = true;
+ if (!ST.enableFlatScratch())
+ PrivateSegmentBuffer = true;
if (UseFixedABI) {
DispatchPtr = true;
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index 56748eafab28..b3af5fc946df 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -35,8 +35,10 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
-; GCN-DAG: v_readlane_b32 s4, v40, 2
-; GCN-DAG: v_readlane_b32 s5, v40, 3
+; MUBUF-DAG: v_readlane_b32 s4, v40, 2
+; MUBUF-DAG: v_readlane_b32 s5, v40, 3
+; FLATSCR-DAG: v_readlane_b32 s0, v40, 2
+; FLATSCR-DAG: v_readlane_b32 s1, v40, 3
; GCN: v_readlane_b32 s35, v40, 1
; GCN: v_readlane_b32 s34, v40, 0
@@ -134,14 +136,18 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace
; FIXME: What is the expected behavior for reserved registers here?
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; MUBUF: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; FLATSCR: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN: #ASMSTART
; GCN-NEXT: ; def s33
; GCN-NEXT: #ASMEND
-; GCN: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF: s_swappc_b64 s[30:31], s[4:5]
+; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN: ;;#ASMSTART
; GCN-NEXT: ; use s33
; GCN-NEXT: ;;#ASMEND
@@ -157,9 +163,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}}
; GCN-NOT: s34
-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; MUBUF: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; FLATSCR: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN-NOT: s34
@@ -168,7 +177,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
; GCN-NEXT: ;;#ASMEND
; GCN-NOT: s34
-; GCN: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF: s_swappc_b64 s[30:31], s[4:5]
+; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN-NOT: s34
@@ -186,9 +196,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}
; GCN-NOT: v32
-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; MUBUF: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; FLATSCR: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN-NOT: v40
@@ -196,7 +209,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
; GCN-NEXT: ; def v40
; GCN-NEXT: ;;#ASMEND
-; GCN: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF: s_swappc_b64 s[30:31], s[4:5]
+; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN-NOT: v40
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 7bc9dcfb20a8..20d50200e3a2 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -13,9 +13,10 @@ define void @callee_no_stack() #0 {
; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
+; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_no_fp_elim_all() #1 {
ret void
@@ -48,7 +49,8 @@ define void @callee_with_stack() #0 {
; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
+; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
; FLATSCR-NEXT: s_add_u32 s32, s32, 8
@@ -57,7 +59,7 @@ define void @callee_with_stack() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}}
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
-; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_all() #1 {
@@ -100,8 +102,10 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
; GCN: s_swappc_b64
-; GCN-DAG: v_readlane_b32 s5, [[CSR_VGPR]]
-; GCN-DAG: v_readlane_b32 s4, [[CSR_VGPR]]
+; MUBUF-DAG: v_readlane_b32 s5, [[CSR_VGPR]]
+; MUBUF-DAG: v_readlane_b32 s4, [[CSR_VGPR]]
+; FLATSCR-DAG: v_readlane_b32 s0, [[CSR_VGPR]]
+; FLATSCR-DAG: v_readlane_b32 s1, [[CSR_VGPR]]
; MUBUF: s_sub_u32 s32, s32, 0x400{{$}}
; FLATSCR: s_sub_u32 s32, s32, 16{{$}}
@@ -140,8 +144,10 @@ define void @callee_with_stack_and_call() #0 {
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
; GCN: s_swappc_b64
-; GCN-DAG: v_readlane_b32 s4, v40, 0
-; GCN-DAG: v_readlane_b32 s5, v40, 1
+; MUBUF-DAG: v_readlane_b32 s4, v40, 0
+; MUBUF-DAG: v_readlane_b32 s5, v40, 1
+; FLATSCR-DAG: v_readlane_b32 s0, v40, 0
+; FLATSCR-DAG: v_readlane_b32 s1, v40, 1
; MUBUF: s_sub_u32 s32, s32, 0x400
; FLATSCR: s_sub_u32 s32, s32, 16
@@ -238,9 +244,10 @@ define void @spill_only_csr_sgpr() {
; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
; MUBUF: s_add_u32 s32, s32, 0x300
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300
+; MUBUF-NEXT: s_mov_b32 s33, s4
; FLATSCR: s_add_u32 s32, s32, 12
; FLATSCR-NEXT: s_sub_u32 s32, s32, 12
-; GCN-NEXT: s_mov_b32 s33, s4
+; FLATSCR-NEXT: s_mov_b32 s33, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
@@ -330,7 +337,8 @@ define void @no_new_vgpr_for_fp_csr() #1 {
; GCN: s_waitcnt
; MUBUF-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0
; FLATSCR-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x1fff
-; GCN-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
+; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; MUBUF-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000
; FLATSCR-NEXT: s_and_b32 s33, [[SCRATCH]], 0xffffe000
; MUBUF-NEXT: s_add_u32 s32, s32, 0x100000
@@ -340,7 +348,7 @@ define void @no_new_vgpr_for_fp_csr() #1 {
; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x100000
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x4000
-; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @realign_stack_no_fp_elim() #1 {
@@ -359,15 +367,18 @@ define void @realign_stack_no_fp_elim() #1 {
; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4
; GCN: ;;#ASMSTART
-; GCN: v_readlane_b32 s4, v1, 0
+; MUBUF: v_readlane_b32 s4, v1, 0
; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
+; MUBUF-NEXT: v_readlane_b32 s5, v1, 1
+; FLATSCR: v_readlane_b32 s0, v1, 0
; FLATSCR-NEXT: s_add_u32 s32, s32, 8
-; GCN-NEXT: v_readlane_b32 s5, v1, 1
+; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
-; GCN-NEXT: v_readlane_b32 s33, v1, 2
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[4:5]
+; GCN-NEXT: v_readlane_b32 s33, v1, 2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[4:5]
+; FLATSCR-NEXT: s_setpc_b64 s[0:1]
define void @no_unused_non_csr_sgpr_for_fp() #1 {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
@@ -399,9 +410,11 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; MUBUF: s_add_u32 s32, s32, 0x300{{$}}
; FLATSCR: s_add_u32 s32, s32, 12{{$}}
-; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0
+; MUBUF: v_readlane_b32 s4, [[CSR_VGPR]], 0
+; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0
; GCN: ;;#ASMSTART
-; GCN: v_readlane_b32 s5, [[CSR_VGPR]], 1
+; MUBUF: v_readlane_b32 s5, [[CSR_VGPR]], 1
+; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300{{$}}
; FLATSCR-NEXT: s_sub_u32 s32, s32, 12{{$}}
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
@@ -450,9 +463,11 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; MUBUF-DAG: buffer_store_dword
; FLATSCR-DAG: scratch_store_dword
-; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0
+; MUBUF: v_readlane_b32 s4, [[CSR_VGPR]], 0
+; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0
; GCN: ;;#ASMSTART
-; GCN: v_readlane_b32 s5, [[CSR_VGPR]], 1
+; MUBUF: v_readlane_b32 s5, [[CSR_VGPR]], 1
+; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x40300{{$}}
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x100c{{$}}
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
@@ -514,20 +529,21 @@ define void @ipra_call_with_stack() #0 {
; With no free registers, we must spill the FP to memory.
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
-; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
+; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4
-; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:4
-; GCN: s_mov_b64 exec, [[COPY_EXEC1]]
-; GCN: s_mov_b32 s33, s32
-; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
+; FLATSCR: s_mov_b32 s0, s33
+; GCN: s_mov_b32 s33, s32
+; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4
-; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:4
-; GCN: s_waitcnt vmcnt(0)
-; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
-; GCN: s_mov_b64 exec, [[COPY_EXEC2]]
-; GCN: s_setpc_b64
-; GCN: ScratchSize: 8
+; FLATSCR: s_mov_b32 s33, s0
+; MUBUF: s_waitcnt vmcnt(0)
+; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
+; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
+; GCN: s_setpc_b64
+; MUBUF: ScratchSize: 8
+; FLATSCR: ScratchSize: 0
define void @callee_need_to_spill_fp_to_memory() #3 {
call void asm sideeffect "; clobber nonpreserved SGPRs",
"~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
@@ -547,20 +563,19 @@ define void @callee_need_to_spill_fp_to_memory() #3 {
; need to spill the FP to memory if there are no free lanes in the reserved
; VGPR.
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
-; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
+; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]]
-; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:[[OFF:[0-9]+]]
-; GCN: s_mov_b64 exec, [[COPY_EXEC1]]
+; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NOT: v_writelane_b32 v40, s33
-; GCN: s_mov_b32 s33, s32
+; MUBUF: s_mov_b32 s33, s32
+; FLATSCR: s_mov_b32 s33, s0
; GCN-NOT: v_readlane_b32 s33, v40
-; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]]
-; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:[[OFF]]
-; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
-; GCN: s_mov_b64 exec, [[COPY_EXEC2]]
-; GCN: s_setpc_b64
+; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
+; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
+; GCN: s_setpc_b64
define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",
"~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
@@ -585,14 +600,14 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
; If the size of the offset exceeds the MUBUF offset field we need another
; scratch VGPR to hold the offset.
; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset
-; GCN: s_or_saveexec_b64 s[4:5], -1
+; MUBUF: s_or_saveexec_b64 s[4:5], -1
; MUBUF: v_mov_b32_e32 v0, s33
; GCN-NOT: v_mov_b32_e32 v0, 0x1008
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008
; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill
-; FLATSCR-NEXT: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1008
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s33
-; FLATSCR-NEXT: scratch_store_dword off, v0, [[SOFF]] ; 4-byte Folded Spill
+; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004
+; FLATSCR: v_mov_b32_e32 v0, 0
+; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #3 {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 354bc0f0d7f2..9e3ced2e7f42 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -15,11 +15,11 @@ define <2 x half> @chain_hi_to_lo_private() {
; FLATSCR-LABEL: chain_hi_to_lo_private:
; FLATSCR: ; %bb.0: ; %bb
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT: s_mov_b32 s4, 2
-; FLATSCR-NEXT: scratch_load_ushort v0, off, s4
-; FLATSCR-NEXT: s_mov_b32 s4, 0
+; FLATSCR-NEXT: s_mov_b32 s0, 2
+; FLATSCR-NEXT: scratch_load_ushort v0, off, s0
+; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s4
+; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -256,13 +256,13 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
;
; FLATSCR-LABEL: vload2_private:
; FLATSCR: ; %bb.0: ; %entry
-; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:4
@@ -272,8 +272,8 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:6
; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:4
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s6
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s7
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s2
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s3
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:8
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index 92761a692115..1711f3a517cb 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -150,10 +150,10 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff
-; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
-; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
%p.0 = load i16, i16 addrspace(5)* %p, align 1
@@ -251,10 +251,10 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff
-; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
-; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
%p.0 = load i16, i16 addrspace(5)* %p, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 1e2732e39136..b0532e36ed54 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -1109,13 +1109,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_movk_i32 s4, 0x3000
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: scratch_store_dword off, v0, s32
-; GFX9-NEXT: s_add_u32 s4, s32, s4
+; GFX9-NEXT: s_add_u32 s0, s32, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: scratch_store_dword off, v0, s4 offset:3712
-; GFX9-NEXT: scratch_load_dword v0, off, s4 offset:3712
+; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
+; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1125,12 +1125,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s4, 0x3800
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_add_u32 s4, s32, s4
+; GFX10-NEXT: s_add_u32 s0, s32, s0
; GFX10-NEXT: scratch_store_dword off, v0, s32
-; GFX10-NEXT: scratch_store_dword off, v1, s4 offset:1664
-; GFX10-NEXT: scratch_load_dword v0, off, s4 offset:1664
+; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
+; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index e75873ee2ce3..44fe6cdf915b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -1364,8 +1364,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1413,8 +1413,8 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1463,8 +1463,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1614,8 +1614,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1664,8 +1664,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1716,8 +1716,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index c725b0c339ff..c8f7fdebc4e4 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -56,37 +56,37 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8
;
; FLATSCR-LABEL: local_stack_offset_uses_sp:
; FLATSCR: ; %bb.0: ; %entry
-; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000
-; FLATSCR-NEXT: s_mov_b32 s6, 0
+; FLATSCR-NEXT: s_mov_b32 s2, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi
; FLATSCR-NEXT: BB0_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
-; FLATSCR-NEXT: s_add_u32 s7, 0x3000, s6
-; FLATSCR-NEXT: s_add_i32 s6, s6, 1
-; FLATSCR-NEXT: s_cmpk_lt_u32 s6, 0x2120
-; FLATSCR-NEXT: scratch_store_byte off, v0, s7
+; FLATSCR-NEXT: s_add_u32 s3, 0x3000, s2
+; FLATSCR-NEXT: s_add_i32 s2, s2, 1
+; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120
+; FLATSCR-NEXT: scratch_store_byte off, v0, s3
; FLATSCR-NEXT: s_cbranch_scc1 BB0_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_movk_i32 s6, 0x20d0
-; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6
-; FLATSCR-NEXT: scratch_load_dword v1, off, s6 offset:4
-; FLATSCR-NEXT: s_movk_i32 s6, 0x2000
-; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6
-; FLATSCR-NEXT: scratch_load_dword v0, off, s6 offset:208
-; FLATSCR-NEXT: s_movk_i32 s6, 0x3000
-; FLATSCR-NEXT: scratch_load_dword v2, off, s6 offset:68
-; FLATSCR-NEXT: s_movk_i32 s6, 0x3000
-; FLATSCR-NEXT: scratch_load_dword v3, off, s6 offset:64
+; FLATSCR-NEXT: s_movk_i32 s2, 0x20d0
+; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
+; FLATSCR-NEXT: scratch_load_dword v1, off, s2 offset:4
+; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
+; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
+; FLATSCR-NEXT: scratch_load_dword v0, off, s2 offset:208
+; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
+; FLATSCR-NEXT: scratch_load_dword v2, off, s2 offset:68
+; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
+; FLATSCR-NEXT: scratch_load_dword v3, off, s2 offset:64
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v2, s4
-; FLATSCR-NEXT: v_mov_b32_e32 v3, s5
+; FLATSCR-NEXT: v_mov_b32_e32 v3, s1
+; FLATSCR-NEXT: v_mov_b32_e32 v2, s0
; FLATSCR-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; FLATSCR-NEXT: s_endpgm
entry:
@@ -146,36 +146,36 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac
; FLATSCR-LABEL: func_local_stack_offset_uses_sp:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT: s_add_u32 s4, s32, 0x1fff
-; FLATSCR-NEXT: s_mov_b32 s6, s33
-; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffe000
+; FLATSCR-NEXT: s_add_u32 s0, s32, 0x1fff
+; FLATSCR-NEXT: s_mov_b32 s2, s33
+; FLATSCR-NEXT: s_and_b32 s33, s0, 0xffffe000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; FLATSCR-NEXT: s_mov_b32 s4, 0
+; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: s_add_u32 s32, s32, 0x6000
; FLATSCR-NEXT: scratch_store_dword off, v2, s33
; FLATSCR-NEXT: BB1_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_u32 vcc_hi, s33, 0x1000
-; FLATSCR-NEXT: s_add_u32 s5, vcc_hi, s4
-; FLATSCR-NEXT: s_add_i32 s4, s4, 1
-; FLATSCR-NEXT: s_cmpk_lt_u32 s4, 0x2120
-; FLATSCR-NEXT: scratch_store_byte off, v2, s5
+; FLATSCR-NEXT: s_add_u32 s1, vcc_hi, s0
+; FLATSCR-NEXT: s_add_i32 s0, s0, 1
+; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
+; FLATSCR-NEXT: scratch_store_byte off, v2, s1
; FLATSCR-NEXT: s_cbranch_scc1 BB1_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_movk_i32 s4, 0x20d0
-; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000
-; FLATSCR-NEXT: s_add_u32 s4, s5, s4
-; FLATSCR-NEXT: scratch_load_dword v3, off, s4 offset:4
-; FLATSCR-NEXT: s_movk_i32 s4, 0x2000
-; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000
-; FLATSCR-NEXT: s_add_u32 s4, s5, s4
-; FLATSCR-NEXT: scratch_load_dword v2, off, s4 offset:208
-; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000
-; FLATSCR-NEXT: scratch_load_dword v4, off, s4 offset:68
-; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000
-; FLATSCR-NEXT: scratch_load_dword v5, off, s4 offset:64
+; FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
+; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
+; FLATSCR-NEXT: s_add_u32 s0, s1, s0
+; FLATSCR-NEXT: scratch_load_dword v3, off, s0 offset:4
+; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
+; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
+; FLATSCR-NEXT: s_add_u32 s0, s1, s0
+; FLATSCR-NEXT: scratch_load_dword v2, off, s0 offset:208
+; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
+; FLATSCR-NEXT: scratch_load_dword v4, off, s0 offset:68
+; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
+; FLATSCR-NEXT: scratch_load_dword v5, off, s0 offset:64
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000
-; FLATSCR-NEXT: s_mov_b32 s33, s6
+; FLATSCR-NEXT: s_mov_b32 s33, s2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 7aa0c8f70205..4ec62515b103 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -54,36 +54,36 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; FLATSCR: ; %bb.0: ; %entry
-; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
+; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: s_mov_b32 s32, 16
; FLATSCR-NEXT: s_mov_b32 s33, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: s_cmp_lg_u32 s8, 0
+; FLATSCR-NEXT: s_cmp_lg_u32 s4, 0
; FLATSCR-NEXT: s_cbranch_scc1 BB0_3
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_cmp_lg_u32 s9, 0
+; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT: s_cbranch_scc1 BB0_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
-; FLATSCR-NEXT: s_mov_b32 s6, s32
-; FLATSCR-NEXT: s_movk_i32 s7, 0x1000
-; FLATSCR-NEXT: s_add_i32 s8, s6, s7
-; FLATSCR-NEXT: s_add_u32 s6, s6, s7
+; FLATSCR-NEXT: s_mov_b32 s2, s32
+; FLATSCR-NEXT: s_movk_i32 s3, 0x1000
+; FLATSCR-NEXT: s_add_i32 s4, s2, s3
+; FLATSCR-NEXT: s_add_u32 s2, s2, s3
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT: scratch_store_dword off, v1, s6
+; FLATSCR-NEXT: scratch_store_dword off, v1, s2
; FLATSCR-NEXT: v_mov_b32_e32 v1, 1
-; FLATSCR-NEXT: s_lshl_b32 s6, s10, 2
-; FLATSCR-NEXT: s_mov_b32 s32, s8
-; FLATSCR-NEXT: scratch_store_dword off, v1, s8 offset:4
-; FLATSCR-NEXT: s_add_i32 s8, s8, s6
-; FLATSCR-NEXT: scratch_load_dword v1, off, s8
-; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; FLATSCR-NEXT: s_lshl_b32 s2, s6, 2
+; FLATSCR-NEXT: s_mov_b32 s32, s4
+; FLATSCR-NEXT: scratch_store_dword off, v1, s4 offset:4
+; FLATSCR-NEXT: s_add_i32 s4, s4, s2
+; FLATSCR-NEXT: scratch_load_dword v1, off, s4
+; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB0_3: ; %bb.2
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
@@ -162,31 +162,31 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; FLATSCR: ; %bb.0: ; %entry
-; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; FLATSCR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; FLATSCR-NEXT: s_mov_b32 s32, 64
; FLATSCR-NEXT: s_mov_b32 s33, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: s_cmp_lg_u32 s6, 0
+; FLATSCR-NEXT: s_cmp_lg_u32 s2, 0
; FLATSCR-NEXT: s_cbranch_scc1 BB1_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000
-; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000
+; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT: scratch_store_dword off, v1, s6
+; FLATSCR-NEXT: scratch_store_dword off, v1, s2
; FLATSCR-NEXT: v_mov_b32_e32 v1, 1
-; FLATSCR-NEXT: s_lshl_b32 s7, s7, 2
-; FLATSCR-NEXT: s_mov_b32 s32, s6
-; FLATSCR-NEXT: scratch_store_dword off, v1, s6 offset:4
-; FLATSCR-NEXT: s_add_i32 s6, s6, s7
-; FLATSCR-NEXT: scratch_load_dword v1, off, s6
-; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2
+; FLATSCR-NEXT: s_mov_b32 s32, s2
+; FLATSCR-NEXT: scratch_store_dword off, v1, s2 offset:4
+; FLATSCR-NEXT: s_add_i32 s2, s2, s3
+; FLATSCR-NEXT: scratch_load_dword v1, off, s2
+; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB1_2: ; %bb.1
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
@@ -261,38 +261,38 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT: s_mov_b32 s9, s33
+; FLATSCR-NEXT: s_mov_b32 s5, s33
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT: s_mov_b32 s33, s32
; FLATSCR-NEXT: s_add_u32 s32, s32, 16
-; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz BB2_3
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
; FLATSCR-NEXT: s_cbranch_execz BB2_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
-; FLATSCR-NEXT: s_mov_b32 s6, s32
-; FLATSCR-NEXT: s_movk_i32 s7, 0x1000
-; FLATSCR-NEXT: s_add_i32 s8, s6, s7
-; FLATSCR-NEXT: s_add_u32 s6, s6, s7
+; FLATSCR-NEXT: s_mov_b32 s2, s32
+; FLATSCR-NEXT: s_movk_i32 s3, 0x1000
+; FLATSCR-NEXT: s_add_i32 s4, s2, s3
+; FLATSCR-NEXT: s_add_u32 s2, s2, s3
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; FLATSCR-NEXT: scratch_store_dword off, v2, s6
+; FLATSCR-NEXT: scratch_store_dword off, v2, s2
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
-; FLATSCR-NEXT: scratch_store_dword off, v2, s8 offset:4
-; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s8
+; FLATSCR-NEXT: scratch_store_dword off, v2, s4 offset:4
+; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5
-; FLATSCR-NEXT: s_mov_b32 s32, s8
+; FLATSCR-NEXT: s_mov_b32 s32, s4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB2_3: ; %bb.2
-; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5]
+; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_sub_u32 s32, s32, 16
-; FLATSCR-NEXT: s_mov_b32 s33, s9
+; FLATSCR-NEXT: s_mov_b32 s33, s5
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
@@ -361,33 +361,33 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT: s_add_u32 s4, s32, 63
-; FLATSCR-NEXT: s_mov_b32 s7, s33
-; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffffc0
+; FLATSCR-NEXT: s_add_u32 s0, s32, 63
+; FLATSCR-NEXT: s_mov_b32 s3, s33
+; FLATSCR-NEXT: s_and_b32 s33, s0, 0xffffffc0
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT: s_add_u32 s32, s32, 0x80
-; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz BB3_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000
-; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000
+; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; FLATSCR-NEXT: scratch_store_dword off, v2, s6
+; FLATSCR-NEXT: scratch_store_dword off, v2, s2
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
-; FLATSCR-NEXT: scratch_store_dword off, v2, s6 offset:4
-; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s6
+; FLATSCR-NEXT: scratch_store_dword off, v2, s2 offset:4
+; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4
-; FLATSCR-NEXT: s_mov_b32 s32, s6
+; FLATSCR-NEXT: s_mov_b32 s32, s2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB3_2: ; %bb.1
-; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5]
+; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x80
-; FLATSCR-NEXT: s_mov_b32 s33, s7
+; FLATSCR-NEXT: s_mov_b32 s33, s3
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index 969edbf12647..62213b2e04e5 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -1,10 +1,11 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
; Test that the VGPR spiller correctly switches to SGPR offsets when the
; instruction offset field would overflow, and that it accounts for memory
; swizzling.
-; CHECK-LABEL: test_inst_offset_kernel
+; GCN-LABEL: test_inst_offset_kernel
define amdgpu_kernel void @test_inst_offset_kernel() {
entry:
; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
@@ -13,7 +14,8 @@ entry:
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill.
@@ -25,7 +27,7 @@ entry:
ret void
}
-; CHECK-LABEL: test_sgpr_offset_kernel
+; GCN-LABEL: test_sgpr_offset_kernel
define amdgpu_kernel void @test_sgpr_offset_kernel() {
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -35,8 +37,10 @@ entry:
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; 0x40000 / 64 = 4096 (for wave64)
- ; CHECK: s_mov_b32 s6, 0x40000
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
+ ; MUBUF: s_mov_b32 s6, 0x40000
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
+ ; FLATSCR: s_movk_i32 s2, 0x1000
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill
@@ -51,7 +55,7 @@ entry:
; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
; pointer to temporarily update, so we just crash.
-; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail
+; GCN-LABEL: test_sgpr_offset_function_scavenge_fail
define void @test_sgpr_offset_function_scavenge_fail() #2 {
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -74,9 +78,11 @@ entry:
; 0x40000 / 64 = 4096 (for wave64)
%a = load volatile i32, i32 addrspace(5)* %aptr
- ; CHECK: s_add_u32 s32, s32, 0x40000
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
- ; CHECK: s_sub_u32 s32, s32, 0x40000
+ ; MUBUF: s_add_u32 s32, s32, 0x40000
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
+ ; MUBUF: s_sub_u32 s32, s32, 0x40000
+ ; FLATSCR: s_add_u32 [[SOFF:s[0-9+]]], s32, 0x1000
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
%asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
@@ -91,16 +97,18 @@ entry:
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
- ; CHECK: s_add_u32 s32, s32, 0x40000
- ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
- ; CHECK: s_sub_u32 s32, s32, 0x40000
+ ; MUBUF: s_add_u32 s32, s32, 0x40000
+ ; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
+ ; MUBUF: s_sub_u32 s32, s32, 0x40000
+ ; FLATSCR: s_add_u32 [[SOFF:s[0-9+]]], s32, 0x1000
+ ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload
; Force %a to spill with no free SGPRs
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
ret void
}
-; CHECK-LABEL: test_sgpr_offset_subregs_kernel
+; GCN-LABEL: test_sgpr_offset_subregs_kernel
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
entry:
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
@@ -110,8 +118,11 @@ entry:
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
+ ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
@@ -128,7 +139,7 @@ entry:
ret void
}
-; CHECK-LABEL: test_inst_offset_subregs_kernel
+; GCN-LABEL: test_inst_offset_subregs_kernel
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
entry:
; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
@@ -139,9 +150,12 @@ entry:
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; 0x3ff00 / 64 = 4092 (for wave64)
- ; CHECK: s_mov_b32 s6, 0x3ff00
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
+ ; MUBUF: s_mov_b32 s6, 0x3ff00
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
+ ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
@@ -158,7 +172,7 @@ entry:
ret void
}
-; CHECK-LABEL: test_inst_offset_function
+; GCN-LABEL: test_inst_offset_function
define void @test_inst_offset_function() {
entry:
; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
@@ -167,7 +181,8 @@ entry:
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill.
@@ -179,7 +194,7 @@ entry:
ret void
}
-; CHECK-LABEL: test_sgpr_offset_function
+; GCN-LABEL: test_sgpr_offset_function
define void @test_sgpr_offset_function() {
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -189,8 +204,10 @@ entry:
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; 0x40000 / 64 = 4096 (for wave64)
- ; CHECK: s_add_u32 s4, s32, 0x40000
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
+ ; MUBUF: s_add_u32 s4, s32, 0x40000
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
+ ; FLATSCR: s_add_u32 s0, s32, 0x1000
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill
@@ -202,7 +219,7 @@ entry:
ret void
}
-; CHECK-LABEL: test_sgpr_offset_subregs_function
+; GCN-LABEL: test_sgpr_offset_subregs_function
define void @test_sgpr_offset_subregs_function() {
entry:
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
@@ -212,8 +229,10 @@ entry:
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4088 ; 4-byte Folded Spill
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4092 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
@@ -230,7 +249,7 @@ entry:
ret void
}
-; CHECK-LABEL: test_inst_offset_subregs_function
+; GCN-LABEL: test_inst_offset_subregs_function
define void @test_inst_offset_subregs_function() {
entry:
; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
@@ -241,9 +260,12 @@ entry:
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; 0x3ff00 / 64 = 4092 (for wave64)
- ; CHECK: s_add_u32 s4, s32, 0x3ff00
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
- ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
+ ; MUBUF: s_add_u32 s4, s32, 0x3ff00
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
+ ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0xffc
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
+ ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index 62b5222f9621..8c4c7069fffa 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -60,26 +60,18 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
; FLATSCR-NEXT: s_addc_u32 s3, s3, 0
; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24
-; FLATSCR-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; FLATSCR-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; FLATSCR-NEXT: s_mov_b32 s38, -1
-; FLATSCR-NEXT: s_mov_b32 s39, 0x31c16000
-; FLATSCR-NEXT: s_add_u32 s36, s36, s5
-; FLATSCR-NEXT: s_addc_u32 s37, s37, 0
+; FLATSCR-NEXT: s_load_dword s2, s[0:1], 0x24
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000
; FLATSCR-NEXT: v_mov_b32_e32 v3, 0
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0x400000
; FLATSCR-NEXT: ; implicit-def: $vcc_hi
-; FLATSCR-NEXT: s_getpc_b64 s[4:5]
-; FLATSCR-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
-; FLATSCR-NEXT: s_mov_b64 s[0:1], s[36:37]
-; FLATSCR-NEXT: s_mov_b64 s[2:3], s[38:39]
-; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s2
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; FLATSCR-NEXT: s_and_saveexec_b32 s0, vcc_lo
; FLATSCR-NEXT: s_cbranch_execz BB0_2