[llvm] r361655 - AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri May 24 11:18:52 PDT 2019
Author: arsenm
Date: Fri May 24 11:18:51 2019
New Revision: 361655
URL: http://llvm.org/viewvc/llvm-project?rev=361655&view=rev
Log:
AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills
If some lanes weren't active on entry to the function, this could
clobber their VGPR values.
Modified:
llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll
llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll
llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll
llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp?rev=361655&r1=361654&r2=361655&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp Fri May 24 11:18:51 2019
@@ -523,22 +523,20 @@ void SIFrameLowering::emitEntryFunctionS
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
- MachineFunction *MF = MBB.getParent();
-
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
+static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF,
+ LivePhysRegs &LiveRegs,
+ const TargetRegisterClass &RC) {
+ const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
- LivePhysRegs LiveRegs(TRI);
- LiveRegs.addLiveIns(MBB);
// Mark callee saved registers as used so we will not choose them.
- const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
for (unsigned i = 0; CSRegs[i]; ++i)
LiveRegs.addReg(CSRegs[i]);
- MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
- for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+ for (unsigned Reg : RC) {
if (LiveRegs.available(MRI, Reg))
return Reg;
}
@@ -561,6 +559,7 @@ void SIFrameLowering::emitPrologue(Machi
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+ LivePhysRegs LiveRegs;
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
@@ -578,7 +577,12 @@ void SIFrameLowering::emitPrologue(Machi
RoundedSize += Alignment;
- unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+
+ unsigned ScratchSPReg
+ = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+ AMDGPU::SReg_32_XM0RegClass);
assert(ScratchSPReg != AMDGPU::NoRegister);
// s_add_u32 tmp_reg, s32, NumBytes
@@ -609,13 +613,33 @@ void SIFrameLowering::emitPrologue(Machi
.setMIFlag(MachineInstr::FrameSetup);
}
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
- : FuncInfo->getSGPRSpillVGPRs()) {
- if (!Reg.FI.hasValue())
- continue;
- TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
- Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
- &TII->getRegisterInfo());
+ if (!FuncInfo->getSGPRSpillVGPRs().empty()) {
+ if (LiveRegs.empty()) {
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+ }
+
+ // To avoid clobbering VGPRs in lanes that weren't active on function entry,
+ // turn on all lanes before doing the spill to memory.
+ unsigned ScratchExecCopy
+ = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+ AMDGPU::SReg_64_XEXECRegClass);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
+ .addImm(-1);
+
+ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+ : FuncInfo->getSGPRSpillVGPRs()) {
+ if (!Reg.FI.hasValue())
+ continue;
+ TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
+ Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+ &TII->getRegisterInfo());
+ }
+
+ // FIXME: Split block and make terminator.
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(ScratchExecCopy);
}
}
@@ -628,14 +652,32 @@ void SIFrameLowering::emitEpilogue(Machi
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc DL;
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
- : FuncInfo->getSGPRSpillVGPRs()) {
- if (!Reg.FI.hasValue())
- continue;
- TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
- Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
- &TII->getRegisterInfo());
+ if (!FuncInfo->getSGPRSpillVGPRs().empty()) {
+ // See emitPrologue
+ LivePhysRegs LiveRegs(*ST.getRegisterInfo());
+ LiveRegs.addLiveIns(MBB);
+
+ unsigned ScratchExecCopy
+ = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+ AMDGPU::SReg_64_XEXECRegClass);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
+ .addImm(-1);
+
+ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+ : FuncInfo->getSGPRSpillVGPRs()) {
+ if (!Reg.FI.hasValue())
+ continue;
+ TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
+ Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+ &TII->getRegisterInfo());
+ }
+
+ // FIXME: Split block and make terminator.
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(ScratchExecCopy);
}
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
@@ -645,8 +687,6 @@ void SIFrameLowering::emitEpilogue(Machi
const MachineFrameInfo &MFI = MF.getFrameInfo();
uint32_t NumBytes = MFI.getStackSize();
- DebugLoc DL;
-
// FIXME: Clarify distinction between no set SP and SP. For callee functions,
// it's really whether we need SP to be accurate or not.
Modified: llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll?rev=361655&r1=361654&r2=361655&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll Fri May 24 11:18:51 2019
@@ -30,11 +30,11 @@ entry:
; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
; GCN: s_mov_b32 s5, s32
+; GCN: s_add_u32 s32, s32, 0xc00{{$}}
; GCN-DAG: buffer_store_dword v32
; GCN-DAG: buffer_store_dword v33
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
; GCN-DAG: v_writelane_b32
-; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
Modified: llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll?rev=361655&r1=361654&r2=361655&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll Fri May 24 11:18:51 2019
@@ -38,8 +38,8 @@ define amdgpu_kernel void @test_kernel_c
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s5, s33
-; GCN: v_readlane_b32 s37, v32, 4
+; GCN-DAG: s_mov_b32 s5, s33
+; GCN-DAG: v_readlane_b32 s37, v32, 4
; GCN: v_readlane_b32 s36, v32, 3
; GCN: v_readlane_b32 s35, v32, 2
; GCN: v_readlane_b32 s34, v32, 1
@@ -59,7 +59,7 @@ define void @test_func_call_external_voi
; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s5, s33
+; GCN: s_mov_b32 s5, s33
define void @test_func_call_external_void_funcx2() #0 {
call void @external_void_func_void()
call void @external_void_func_void()
@@ -175,7 +175,7 @@ define amdgpu_kernel void @test_call_voi
; GCN-NEXT: ; clobber
; GCN-NEXT: #ASMEND
; GCN-NEXT: v_readlane_b32 s33, v0, 0
-; GCN-NEXT: s_setpc_b64
+; GCN: s_setpc_b64
define hidden void @void_func_void_clobber_s33() #2 {
call void asm sideeffect "; clobber", "~{s33}"() #0
ret void
Modified: llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll?rev=361655&r1=361654&r2=361655&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll Fri May 24 11:18:51 2019
@@ -37,19 +37,19 @@ define void @callee_with_stack() #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN: s_mov_b32 s5, s32
+; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8
; GCN-DAG: v_writelane_b32 v32, s33,
; GCN-DAG: v_writelane_b32 v32, s34,
; GCN-DAG: v_writelane_b32 v32, s35,
-; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: s_mov_b32 s33, s5
; GCN: s_swappc_b64
-; GCN: s_mov_b32 s5, s33
+; GCN-DAG: s_mov_b32 s5, s33
; GCN-DAG: v_readlane_b32 s35,
; GCN-DAG: v_readlane_b32 s34,
; GCN-DAG: v_readlane_b32 s33,
@@ -72,7 +72,9 @@ define void @callee_with_stack_and_call(
; GCN-LABEL: {{^}}callee_no_stack_with_call:
; GCN: s_waitcnt
; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-DAG: v_writelane_b32 v32, s33, 0
; GCN-DAG: v_writelane_b32 v32, s34, 1
; GCN: s_mov_b32 s33, s5
@@ -81,9 +83,12 @@ define void @callee_with_stack_and_call(
; GCN-DAG: v_readlane_b32 s34, v32, 1
; GCN-DAG: v_readlane_b32 s33, v32, 0
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
-; GCN: s_sub_u32 s32, s32, 0x400
+; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+
+; GCN: s_sub_u32 s32, s32, 0x400
; GCN: s_setpc_b64
define void @callee_no_stack_with_call() #0 {
call void @external_void_func_void()
@@ -94,11 +99,18 @@ declare void @external_void_func_void()
; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored
; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls:
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+
; GCN: v_writelane_b32 v32
; GCN: ;;#ASMSTART
; GCN: v_readlane_b32 s{{[0-9]+}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+
+; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
Modified: llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll?rev=361655&r1=361654&r2=361655&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll Fri May 24 11:18:51 2019
@@ -326,8 +326,8 @@ define void @func_call_too_many_args_use
; Requires loading and storing to stack slot.
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: s_add_u32 s32, s32, 0x400{{$}}
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}}
Modified: llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll?rev=361655&r1=361654&r2=361655&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll Fri May 24 11:18:51 2019
@@ -28,10 +28,12 @@ define float @call_split_type_used_outsi
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: v_writelane_b32 v32, s35, 2
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_v2f32@rel32@lo+4
@@ -39,12 +41,14 @@ define float @call_split_type_used_outsi
; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: v_readlane_b32 s34, v32, 1
; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -62,10 +66,12 @@ define float @call_split_type_used_outsi
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: v_writelane_b32 v32, s35, 2
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_v3f32@rel32@lo+4
@@ -73,12 +79,14 @@ define float @call_split_type_used_outsi
; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: v_readlane_b32 s34, v32, 1
; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -96,10 +104,12 @@ define half @call_split_type_used_outsid
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: v_writelane_b32 v32, s35, 2
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_v4f16@rel32@lo+4
@@ -107,12 +117,14 @@ define half @call_split_type_used_outsid
; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: v_readlane_b32 s34, v32, 1
; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -130,10 +142,12 @@ define { i32, half } @call_split_type_us
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: v_writelane_b32 v32, s35, 2
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_struct@rel32@lo+4
@@ -141,13 +155,15 @@ define { i32, half } @call_split_type_us
; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: v_readlane_b32 s34, v32, 1
+; GCN-NEXT: v_mov_b32_e32 v1, v4
; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: v_mov_b32_e32 v1, v4
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
Modified: llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll?rev=361655&r1=361654&r2=361655&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll Fri May 24 11:18:51 2019
@@ -10,9 +10,12 @@ declare void @external_void_func_i32(i32
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
; GCN: s_waitcnt
; GCN: s_mov_b32 s5, s32
-; Spill CSR VGPR used for SGPR spilling
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
; GCN-DAG: s_add_u32 s32, s32, 0x400
+; Spill CSR VGPR used for SGPR spilling
+; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+
; GCN-DAG: v_writelane_b32 v32, s33, 0
; GCN-DAG: v_writelane_b32 v32, s34, 1
; GCN-DAG: v_writelane_b32 v32, s35, 2
@@ -22,7 +25,10 @@ declare void @external_void_func_i32(i32
; GCN: v_readlane_b32 s35, v32, 2
; GCN: v_readlane_b32 s34, v32, 1
; GCN: v_readlane_b32 s33, v32, 0
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+
; GCN: s_sub_u32 s32, s32, 0x400
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_i32_imm() #0 {
Modified: llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll?rev=361655&r1=361654&r2=361655&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll Fri May 24 11:18:51 2019
@@ -207,13 +207,17 @@ entry:
; Have another non-tail in the function
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12
+; GCN: s_add_u32 s32, s32, 0x400
+
+; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:12
+; GCN-NEXT: s_mov_b64 exec
+
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v34, s33, 0
; GCN-DAG: v_writelane_b32 v34, s34, 1
; GCN-DAG: v_writelane_b32 v34, s35, 2
-; GCN-DAG: s_add_u32 s32, s32, 0x400
; GCN-DAG: s_getpc_b64
; GCN: s_swappc_b64
@@ -228,7 +232,10 @@ entry:
; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
-; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12
+; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:12
+; GCN-NEXT: s_mov_b64 exec
+
; GCN: s_sub_u32 s32, s32, 0x400
; GCN: s_setpc_b64 s[6:7]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
More information about the llvm-commits
mailing list