[llvm] [AMDGPU] Insert spill codes for the SGPRs used for EXEC copy (PR #79428)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 25 01:29:23 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Christudasan Devadasan (cdevadas)
<details>
<summary>Changes</summary>
The SGPR registers used for preserving the EXEC mask while lowering whole-wave register spills and copies should be saved and restored in the prolog and epilog if they are in the CSR range. This preservation currently doesn't happen when only wwm-copies are lowered and there are no wwm-spills. This patch addresses that problem.
---
Full diff: https://github.com/llvm/llvm-project/pull/79428.diff
7 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+10-6)
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.h (+2-2)
- (modified) llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp (+5)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (+5-4)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (+14)
- (modified) llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll (+25-11)
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll (+6-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 9d062eb156d5c5a..9bd8996f22fca25 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1487,8 +1487,7 @@ void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
// The special SGPR spills like the one needed for FP, BP or any reserved
// registers delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
- MachineFunction &MF, BitVector &SavedVGPRs,
- bool NeedExecCopyReservedReg) const {
+ MachineFunction &MF, BitVector &SavedVGPRs) const {
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -1504,9 +1503,10 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
- if (NeedExecCopyReservedReg) {
+ if (MFI->shouldPreserveExecCopyReservedReg()) {
Register ReservedReg = MFI->getSGPRForEXECCopy();
assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
+ MRI.reserveReg(ReservedReg, TRI);
Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
if (UnusedScratchReg) {
// If found any unused scratch SGPR, reserve the register itself for Exec
@@ -1570,7 +1570,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
- bool NeedExecCopyReservedReg = false;
+ bool NeedsExecCopyReservedReg = false;
MachineInstr *ReturnMI = nullptr;
for (MachineBasicBlock &MBB : MF) {
@@ -1588,7 +1588,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
- NeedExecCopyReservedReg = true;
+ NeedsExecCopyReservedReg = true;
else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
(MFI->isChainFunction() &&
@@ -1602,6 +1602,10 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
+ // If found any wwm-spills, preserve the register(s) used for exec copy.
+ if (NeedsExecCopyReservedReg)
+ MFI->setPreserveExecCopyReservedReg();
+
// Remove any VGPRs used in the return value because these do not need to be saved.
// This prevents CSR restore from clobbering return VGPRs.
if (ReturnMI) {
@@ -1620,7 +1624,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (!ST.hasGFX90AInsts())
SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
- determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
+ determinePrologEpilogSGPRSaves(MF, SavedVGPRs);
// The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
// allow the default insertion to handle them.
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index b3feb759ed811fb..926801dcecc3386 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -34,8 +34,8 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
RegScavenger *RS = nullptr) const override;
void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const;
- void determinePrologEpilogSGPRSaves(MachineFunction &MF, BitVector &SavedRegs,
- bool NeedExecCopyReservedReg) const;
+ void determinePrologEpilogSGPRSaves(MachineFunction &MF,
+ BitVector &SavedRegs) const;
void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, DebugLoc &DL,
LiveRegUnits &LiveUnits, Register FrameReg,
diff --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
index 9c3cd1bbd6b0fb6..acea9103a925d66 100644
--- a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
@@ -137,5 +137,10 @@ bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) {
}
}
+ // SGPRs reserved for exec copy should be preserved as we encountered WWM_COPY
+ // instances.
+ if (Changed)
+ MFI->setPreserveExecCopyReservedReg();
+
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index b94d143a75e5edf..803feaf9cc5a968 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -671,8 +671,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
const llvm::MachineFunction &MF)
: ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
- GDSSize(MFI.getGDSSize()),
- DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
+ GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
+ IsEntryFunction(MFI.isEntryFunction()),
NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
@@ -684,9 +684,9 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
BytesInStackArgArea(MFI.getBytesInStackArgArea()),
ReturnsVoid(MFI.returnsVoid()),
+ PreserveExecCopyReservedReg(MFI.shouldPreserveExecCopyReservedReg()),
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
- PSInputAddr(MFI.getPSInputAddr()),
- PSInputEnable(MFI.getPSInputEnable()),
+ PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
Mode(MFI.getMode()) {
for (Register Reg : MFI.getWWMReservedRegs())
WWMReservedRegs.push_back(regToString(Reg, TRI));
@@ -728,6 +728,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
ReturnsVoid = YamlMFI.ReturnsVoid;
+ PreserveExecCopyReservedReg = YamlMFI.PreserveExecCopyReservedReg;
if (YamlMFI.ScavengeFI) {
auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0336ec4985ea747..fcd6422ab2cc2a7 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -277,6 +277,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
unsigned BytesInStackArgArea = 0;
bool ReturnsVoid = true;
+ bool PreserveExecCopyReservedReg = false;
std::optional<SIArgumentInfo> ArgInfo;
@@ -321,6 +322,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
StringValue("$sp_reg"));
YamlIO.mapOptional("bytesInStackArgArea", MFI.BytesInStackArgArea, 0u);
YamlIO.mapOptional("returnsVoid", MFI.ReturnsVoid, true);
+ YamlIO.mapOptional("preserveExecCopyReservedReg",
+ MFI.PreserveExecCopyReservedReg, false);
YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
YamlIO.mapOptional("psInputAddr", MFI.PSInputAddr, 0u);
YamlIO.mapOptional("psInputEnable", MFI.PSInputEnable, 0u);
@@ -530,6 +533,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// To save/restore EXEC MASK around WWM spills and copies.
Register SGPRForEXECCopy;
+ // To save/restore the exec copy reserved register at function entry/exit.
+ // Since the handling of wwm copies and spills are scattered across multiple
+ // passes, this field would help efficiently insert its spills.
+ bool PreserveExecCopyReservedReg = false;
+
DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills;
// AGPRs used for VGPR spills.
@@ -688,6 +696,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
void setSGPRForEXECCopy(Register Reg) { SGPRForEXECCopy = Reg; }
+ void setPreserveExecCopyReservedReg() { PreserveExecCopyReservedReg = true; }
+
+ bool shouldPreserveExecCopyReservedReg() const {
+ return PreserveExecCopyReservedReg;
+ }
+
ArrayRef<MCPhysReg> getVGPRSpillAGPRs() const {
return SpillVGPR;
}
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index b63c8038f9c81bd..5d580f838e0e0e7 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -32,9 +32,11 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: v_writelane_b32 v2, s24, 5
; GFX906-NEXT: s_mov_b64 s[26:27], s[10:11]
; GFX906-NEXT: v_writelane_b32 v2, s26, 6
+; GFX906-NEXT: v_writelane_b32 v41, s34, 2
; GFX906-NEXT: v_writelane_b32 v2, s27, 7
+; GFX906-NEXT: v_writelane_b32 v41, s35, 3
; GFX906-NEXT: v_writelane_b32 v2, s8, 8
-; GFX906-NEXT: v_writelane_b32 v41, s16, 2
+; GFX906-NEXT: v_writelane_b32 v41, s16, 4
; GFX906-NEXT: v_writelane_b32 v2, s9, 9
; GFX906-NEXT: v_writelane_b32 v41, s30, 0
; GFX906-NEXT: v_writelane_b32 v2, s4, 10
@@ -338,7 +340,9 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: v_readlane_b32 s31, v41, 1
; GFX906-NEXT: v_readlane_b32 s30, v41, 0
; GFX906-NEXT: ; kill: killed $vgpr40
-; GFX906-NEXT: v_readlane_b32 s4, v41, 2
+; GFX906-NEXT: v_readlane_b32 s34, v41, 2
+; GFX906-NEXT: v_readlane_b32 s35, v41, 3
+; GFX906-NEXT: v_readlane_b32 s4, v41, 4
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -379,23 +383,27 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: s_mov_b64 exec, -1
; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
; GFX908-NEXT: s_mov_b64 exec, s[18:19]
-; GFX908-NEXT: v_mov_b32_e32 v3, s16
+; GFX908-NEXT: v_mov_b32_e32 v3, s34
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
+; GFX908-NEXT: v_mov_b32_e32 v3, s35
+; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
+; GFX908-NEXT: v_mov_b32_e32 v3, s16
+; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
; GFX908-NEXT: s_addk_i32 s32, 0x2c00
; GFX908-NEXT: s_mov_b64 s[16:17], exec
; GFX908-NEXT: s_mov_b64 exec, 1
-; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:172
; GFX908-NEXT: v_writelane_b32 v2, s30, 0
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:172
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, s[16:17]
; GFX908-NEXT: s_mov_b64 s[16:17], exec
; GFX908-NEXT: s_mov_b64 exec, 1
-; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:172
; GFX908-NEXT: v_writelane_b32 v2, s31, 0
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:172
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, s[16:17]
; GFX908-NEXT: ; implicit-def: $vgpr2
@@ -729,25 +737,31 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, 1
-; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:172
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readlane_b32 s31, v0, 0
-; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:172
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, s[4:5]
; GFX908-NEXT: s_mov_b64 s[4:5], exec
; GFX908-NEXT: s_mov_b64 exec, 1
-; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:172
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readlane_b32 s30, v0, 0
-; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:172
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, s[4:5]
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
; GFX908-NEXT: ; kill: killed $vgpr40
; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s34, v0
+; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s35, v0
+; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s4, v0
; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1
; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
index cfb1cffdb509648..f680bbdd05cdd22 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
@@ -18,7 +18,9 @@ define void @vector_reg_liverange_split() #0 {
; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[18:19]
-; GFX90A-NEXT: v_writelane_b32 v40, s16, 2
+; GFX90A-NEXT: v_writelane_b32 v40, s28, 2
+; GFX90A-NEXT: v_writelane_b32 v40, s29, 3
+; GFX90A-NEXT: v_writelane_b32 v40, s16, 4
; GFX90A-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; GFX90A-NEXT: v_writelane_b32 v40, s30, 0
; GFX90A-NEXT: s_addk_i32 s32, 0x400
@@ -46,7 +48,9 @@ define void @vector_reg_liverange_split() #0 {
; GFX90A-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
; GFX90A-NEXT: ; kill: killed $vgpr0
-; GFX90A-NEXT: v_readlane_b32 s4, v40, 2
+; GFX90A-NEXT: v_readlane_b32 s28, v40, 2
+; GFX90A-NEXT: v_readlane_b32 s29, v40, 3
+; GFX90A-NEXT: v_readlane_b32 s4, v40, 4
; GFX90A-NEXT: s_xor_saveexec_b64 s[6:7], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, -1
``````````
</details>
https://github.com/llvm/llvm-project/pull/79428
More information about the llvm-commits
mailing list