[llvm] Split vgpr regalloc pipeline (PR #93526)
via llvm-commits
llvm-commits at lists.llvm.org
Tue May 28 03:02:12 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Christudasan Devadasan (cdevadas)
Allocating wwm-registers and per-thread VGPR operands together imposes many
challenges on how registers are reused during allocation. At times, regalloc
reuses a register assigned to a regular VGPR operation for a wwm-operation
over a short range, unintentionally clobbering the register's inactive lanes
and causing correctness issues that are hard to trace.

This patch splits the VGPR allocation pipeline further: wwm-registers are
allocated first, and the regular VGPR operands in a separate pipeline that
follows. The split ensures that the physical registers used for wwm
allocations won't take part in the subsequent allocation pipeline, avoiding
any such clobbering.
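
For orientation, here is the resulting assignment order on the optimized path, condensed as a sketch from the `GCNPassConfig::addRegAssignAndRewriteOptimized` hunk in the diff below (SGPR allocation happens before this and is elided):

```cpp
// Sketch only; see the AMDGPUTargetMachine.cpp hunk below for the
// authoritative sequence.
addPass(&SILowerSGPRSpillsID);         // equivalent of PEI for SGPRs
addPass(&SIPreAllocateWWMRegsID);      // wwm registers used in whole quad mode
addPass(createWWMRegAllocPass(true));  // new: allocate remaining wwm operands
addPass(&SILowerWWMCopiesID);          // lower WWM_COPY pseudos
addPass(createVirtRegRewriter(false)); // rewrite wwm virtual regs to physregs
addPass(&AMDGPUReserveWWMRegsID);      // new: reserve those physregs so that...
addPass(createVGPRAllocPass(true));    // ...per-thread VGPR alloc can't reuse them
```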
---
Patch is 1.41 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93526.diff
85 Files Affected:
- (modified) llvm/include/llvm/CodeGen/MachineRegisterInfo.h (+2)
- (modified) llvm/include/llvm/CodeGen/RegAllocCommon.h (+6-3)
- (modified) llvm/lib/CodeGen/RegAllocBase.cpp (+1-2)
- (modified) llvm/lib/CodeGen/RegAllocFast.cpp (+1-2)
- (modified) llvm/lib/CodeGen/RegAllocGreedy.cpp (+3-3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+4)
- (added) llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp (+104)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+99-9)
- (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1)
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+38-24)
- (modified) llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp (+123-45)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (+21-8)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (+14-3)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+44-24)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.h (+5)
- (modified) llvm/lib/Target/RISCV/RISCVTargetMachine.cpp (+4-2)
- (modified) llvm/lib/Target/X86/X86TargetMachine.cpp (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll (+103-116)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (+8-7)
- (modified) llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll (+19-22)
- (modified) llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll (+77-107)
- (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (+309-354)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+1852-1919)
- (modified) llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir (+37-43)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir (+26-14)
- (modified) llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+44-44)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir (+7-21)
- (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (+260-262)
- (modified) llvm/test/CodeGen/AMDGPU/illegal-eviction-assert.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll (+7-30)
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+22)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll (+277-295)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll (+318-341)
- (modified) llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll (+14-21)
- (modified) llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir (+12-28)
- (modified) llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir (+3-19)
- (modified) llvm/test/CodeGen/AMDGPU/pr51516.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir (-6)
- (modified) llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll (+355-359)
- (modified) llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+1389-1409)
- (modified) llvm/test/CodeGen/AMDGPU/remat-vop.mir (+1548-1542)
- (modified) llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll (+26-4)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir (+9-8)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir (+153-154)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spills-empty-prolog-block.mir (+1-3)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll (+27-35)
- (modified) llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll (+6-8)
- (modified) llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir (+7-10)
- (modified) llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir (+10-12)
- (modified) llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir (+3-9)
- (modified) llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll (+15-22)
- (modified) llvm/test/CodeGen/AMDGPU/spill192.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/spill224.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/spill288.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/spill320.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/spill352.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/spill384.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll (+53-100)
- (modified) llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/trap-abis.ll (+21-23)
- (modified) llvm/test/CodeGen/AMDGPU/true16-ra-pre-gfx11-regression-test.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll (+14-21)
- (modified) llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir (+7-6)
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll (+11-12)
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll (+24-28)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (+226-245)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (+210-292)
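
The infrastructure change enabling the split shows up in the first few hunks: `RegClassFilterFunc` now takes the `MachineRegisterInfo` and the `Register` itself rather than a register class, so an allocation stage can select registers by per-register state — AMDGPU's new `onlyAllocateWWMRegs` keys off the `WWM_REG` virtual register flag, which a class-only filter could not see. A minimal sketch of a filter against the new signature (`hasMyFlag` and `onlyAllocateFlaggedRegs` are hypothetical stand-ins; the sketch also uses the `MachineRegisterInfo::getMF()` accessor this patch adds):

```cpp
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegAllocCommon.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Hypothetical per-register predicate; a real target would consult its
// machine function info (AMDGPU checks SIMachineFunctionInfo for WWM_REG).
static bool hasMyFlag(const MachineFunction &MF, Register Reg) {
  (void)MF;
  (void)Reg;
  return false; // placeholder; target-specific in practice
}

// A filter under the new signature: the register class remains reachable
// through MRI, and per-register queries become possible as well.
static bool onlyAllocateFlaggedRegs(const TargetRegisterInfo &TRI,
                                    const MachineRegisterInfo &MRI,
                                    const Register Reg) {
  (void)TRI; // class-based checks (as in onlyAllocateSGPRs) still work
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return RC && hasMyFlag(MRI.getMF(), Reg);
}
```

Such a filter plugs into the existing factories unchanged, e.g. `createGreedyRegisterAllocator(onlyAllocateFlaggedRegs)`.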
``````````diff
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 09d9a0b4ec402..01d91982ae1c7 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -184,6 +184,8 @@ class MachineRegisterInfo {
TheDelegate->MRI_NoteCloneVirtualRegister(NewReg, SrcReg);
}
+ const MachineFunction &getMF() const { return *MF; }
+
//===--------------------------------------------------------------------===//
// Function State
//===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/CodeGen/RegAllocCommon.h b/llvm/include/llvm/CodeGen/RegAllocCommon.h
index 757ca8e112eec..f3423083eef3a 100644
--- a/llvm/include/llvm/CodeGen/RegAllocCommon.h
+++ b/llvm/include/llvm/CodeGen/RegAllocCommon.h
@@ -10,22 +10,25 @@
#define LLVM_CODEGEN_REGALLOCCOMMON_H
#include <functional>
+#include <llvm/CodeGen/Register.h>
namespace llvm {
class TargetRegisterClass;
class TargetRegisterInfo;
+class MachineRegisterInfo;
typedef std::function<bool(const TargetRegisterInfo &TRI,
- const TargetRegisterClass &RC)> RegClassFilterFunc;
+ const MachineRegisterInfo &MRI, const Register Reg)>
+ RegClassFilterFunc;
/// Default register class filter function for register allocation. All virtual
/// registers should be allocated.
static inline bool allocateAllRegClasses(const TargetRegisterInfo &,
- const TargetRegisterClass &) {
+ const MachineRegisterInfo &,
+ const Register) {
return true;
}
-
}
#endif // LLVM_CODEGEN_REGALLOCCOMMON_H
diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp
index d0dec372f6896..a4645ed93029d 100644
--- a/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -181,8 +181,7 @@ void RegAllocBase::enqueue(const LiveInterval *LI) {
if (VRM->hasPhys(Reg))
return;
- const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
- if (ShouldAllocateClass(*TRI, RC)) {
+ if (ShouldAllocateClass(*TRI, *MRI, Reg)) {
LLVM_DEBUG(dbgs() << "Enqueuing " << printReg(Reg, TRI) << '\n');
enqueueImpl(LI);
} else {
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 6740e1f0edb4f..f6419daba6a2d 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -417,8 +417,7 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
bool RegAllocFast::shouldAllocateRegister(const Register Reg) const {
assert(Reg.isVirtual());
- const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
- return ShouldAllocateClass(*TRI, RC);
+ return ShouldAllocateClass(*TRI, *MRI, Reg);
}
void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) {
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 348277224c7ae..b3bf1899ceeaf 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -2306,9 +2306,9 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
if (Reg.isPhysical())
continue;
- // This may be a skipped class
+ // This may be a skipped register.
if (!VRM->hasPhys(Reg)) {
- assert(!ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg)) &&
+ assert(!ShouldAllocateClass(*TRI, *MRI, Reg) &&
"We have an unallocated variable which should have been handled");
continue;
}
@@ -2698,7 +2698,7 @@ bool RAGreedy::hasVirtRegAlloc() {
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
if (!RC)
continue;
- if (ShouldAllocateClass(*TRI, *RC))
+ if (ShouldAllocateClass(*TRI, *MRI, Reg))
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 6016bd5187d88..cd9f3fb162fd2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -56,6 +56,7 @@ ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPULateCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
+FunctionPass *createAMDGPUReserveWWMRegsPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
@@ -149,6 +150,9 @@ struct AMDGPULowerBufferFatPointersPass
const TargetMachine &TM;
};
+void initializeAMDGPUReserveWWMRegsPass(PassRegistry &);
+extern char &AMDGPUReserveWWMRegsID;
+
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp
new file mode 100644
index 0000000000000..5ed8cd4231d00
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp
@@ -0,0 +1,104 @@
+//===-- AMDGPUReserveWWMRegs.cpp - Add WWM Regs to the reserved regs list -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass should be invoked at the end of the wwm-regalloc pipeline.
+/// It identifies the WWM regs allocated during this pipeline and adds
+/// them to the list of reserved registers so that they won't be available for
+/// per-thread VGPR allocation in the subsequent regalloc pipeline.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-reserve-wwm-regs"
+
+namespace {
+
+class AMDGPUReserveWWMRegs : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUReserveWWMRegs() : MachineFunctionPass(ID) {
+ initializeAMDGPUReserveWWMRegsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Reserve WWM Registers";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(AMDGPUReserveWWMRegs, DEBUG_TYPE,
+ "AMDGPU Reserve WWM Registers", false, false)
+
+char AMDGPUReserveWWMRegs::ID = 0;
+
+char &llvm::AMDGPUReserveWWMRegsID = AMDGPUReserveWWMRegs::ID;
+
+bool AMDGPUReserveWWMRegs::runOnMachineFunction(MachineFunction &MF) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ unsigned Opc = MI.getOpcode();
+ if (Opc != AMDGPU::SI_SPILL_S32_TO_VGPR &&
+ Opc != AMDGPU::SI_RESTORE_S32_FROM_VGPR)
+ continue;
+
+ Register Reg = Opc == AMDGPU::SI_SPILL_S32_TO_VGPR
+ ? MI.getOperand(0).getReg()
+ : MI.getOperand(1).getReg();
+
+ assert(Reg.isPhysical() &&
+ "All WWM registers should have been allocated by now.");
+
+ MFI->reserveWWMRegister(Reg);
+ Changed |= true;
+ }
+ }
+
+ // Clear the renamable flag on operands involving wwm-regs to avoid a MIR
+ // verifier error.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical() &&
+ llvm::is_contained(MFI->getWWMReservedRegs(), Reg))
+ MO.setIsRenamable(false);
+ }
+ }
+ }
+
+ // Now clear the NonWWMRegMask set earlier during the wwm-regalloc pipeline.
+ MFI->clearNonWWMRegAllocMask();
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dbbfe34a63863..e3375c758b8d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -82,24 +82,44 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
: RegisterRegAllocBase(N, D, C) {}
};
+class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
+public:
+ WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+ : RegisterRegAllocBase(N, D, C) {}
+};
+
static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
- const TargetRegisterClass &RC) {
- return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+ const MachineRegisterInfo &MRI,
+ const Register Reg) {
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}
static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
- const TargetRegisterClass &RC) {
- return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+ const MachineRegisterInfo &MRI,
+ const Register Reg) {
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}
+static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI,
+ const Register Reg) {
+ const SIMachineFunctionInfo *MFI =
+ MRI.getMF().getInfo<SIMachineFunctionInfo>();
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
+ MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
+}
-/// -{sgpr|vgpr}-regalloc=... command line option.
+/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
+static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
@@ -116,6 +136,11 @@ static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for VGPRs"));
+static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
+ RegisterPassParser<WWMRegisterRegAlloc>>
+ WWMRegAlloc("wwm-regalloc", cl::Hidden,
+ cl::init(&useDefaultRegisterAllocator),
+ cl::desc("Register allocator to use for WWM registers"));
static void initializeDefaultSGPRRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
@@ -135,6 +160,15 @@ static void initializeDefaultVGPRRegisterAllocatorOnce() {
}
}
+static void initializeDefaultWWMRegisterAllocatorOnce() {
+ RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
+
+ if (!Ctor) {
+ Ctor = WWMRegAlloc;
+ WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
+ }
+}
+
static FunctionPass *createBasicSGPRRegisterAllocator() {
return createBasicRegisterAllocator(onlyAllocateSGPRs);
}
@@ -159,6 +193,18 @@ static FunctionPass *createFastVGPRRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}
+static FunctionPass *createBasicWWMRegisterAllocator() {
+ return createBasicRegisterAllocator(onlyAllocateWWMRegs);
+}
+
+static FunctionPass *createGreedyWWMRegisterAllocator() {
+ return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
+}
+
+static FunctionPass *createFastWWMRegisterAllocator() {
+ return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
+}
+
static SGPRRegisterRegAlloc basicRegAllocSGPR(
"basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
@@ -175,7 +221,15 @@ static VGPRRegisterRegAlloc greedyRegAllocVGPR(
static VGPRRegisterRegAlloc fastRegAllocVGPR(
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
-}
+static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
+ "basic register allocator",
+ createBasicWWMRegisterAllocator);
+static WWMRegisterRegAlloc
+ greedyRegAllocWWMReg("greedy", "greedy register allocator",
+ createGreedyWWMRegisterAllocator);
+static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
+ createFastWWMRegisterAllocator);
+} // namespace
static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
@@ -424,6 +478,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
+ initializeAMDGPUReserveWWMRegsPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
@@ -923,6 +978,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
FunctionPass *createSGPRAllocPass(bool Optimized);
FunctionPass *createVGPRAllocPass(bool Optimized);
+ FunctionPass *createWWMRegAllocPass(bool Optimized);
FunctionPass *createRegAllocPass(bool Optimized) override;
bool addRegAssignAndRewriteFast() override;
@@ -1331,7 +1387,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
}
bool GCNPassConfig::addPreRewrite() {
- addPass(&SILowerWWMCopiesID);
if (EnableRegReassign)
addPass(&GCNNSAReassignID);
return true;
@@ -1367,12 +1422,28 @@ FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
return createFastVGPRRegisterAllocator();
}
+FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
+ // Initialize the global default.
+ llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
+ initializeDefaultWWMRegisterAllocatorOnce);
+
+ RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
+ if (Ctor != useDefaultRegisterAllocator)
+ return Ctor();
+
+ if (Optimized)
+ return createGreedyWWMRegisterAllocator();
+
+ return createFastWWMRegisterAllocator();
+}
+
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
llvm_unreachable("should not be used");
}
static const char RegAllocOptNotSupportedMessage[] =
- "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
+ "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
+ "and -vgpr-regalloc";
bool GCNPassConfig::addRegAssignAndRewriteFast() {
if (!usingDefaultRegAlloc())
@@ -1384,11 +1455,20 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
+
+ // To allocate wwm registers used in whole quad mode operations (for pixel
+ // shaders).
addPass(&SIPreAllocateWWMRegsID);
- addPass(createVGPRAllocPass(false));
+ // For allocating other wwm register operands.
+ addPass(createWWMRegAllocPass(false));
addPass(&SILowerWWMCopiesID);
+ addPass(&AMDGPUReserveWWMRegsID);
+
+ // For allocating per-thread VGPRs.
+ addPass(createVGPRAllocPass(false));
+
return true;
}
@@ -1408,8 +1488,18 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
+
+ // To allocate wwm registers used in whole quad mode operations (for pixel
+ // shaders).
addPass(&SIPreAllocateWWMRegsID);
+ // For allocating other whole wave mode registers.
+ addPass(createWWMRegAllocPass(true));
+ addPass(&SILowerWWMCopiesID);
+ addPass(createVirtRegRewriter(false));
+ addPass(&AMDGPUReserveWWMRegsID);
+
+ // For allocating per-thread VGPRs.
addPass(createVGPRAllocPass(true));
addPreRewrite();
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c992352cb78da..178af07048571 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -94,6 +94,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPURegBankSelect.cpp
AMDGPURegisterBankInfo.cpp
AMDGPURemoveIncompatibleFunctions.cpp
+ AMDGPUReserveWWMRegs.cpp
AMDGPUResourceUsageAnalysis.cpp
AMDGPURewriteOutArguments.cpp
AMDGPURewriteUndefForPHI.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index eae666ab0e7d7..fbb6c3d9fe24b 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1555,6 +1555,17 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
}
}
+// Mark all WWM VGPRs as BB LiveIns.
+static void addWwmRegBBLiveIn(MachineFunction &MF) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto &Reg : MFI->getWWMReservedRegs())
+ MBB.addLiveIn(Reg);
+
+ MBB.sortUniqueLiveIns();
+ }
+}
+
// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedVGPRs,
@@ -1567,11 +1578,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
return;
- MFI->shiftSpillPhysVGPRsToLowestRange(MF);
-
TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
- if (MFI->isEntryFunction())
- return;
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -1581,19 +1588,9 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
MachineInstr *ReturnMI = nullptr;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- // WRITELANE instructions used for SGPR spills can overwrite the inactive
- // lanes of VGPRs and callee must spill and restore them even if they are
- // marked Caller-saved.
-
- // TODO: Handle this elsewhere at an early point. Walking through all MBBs
- // here would be a bad heuristic. A better way should be by calling
- // allocateWWMSpill during the regalloc pipeline whenever a physical
- // register is allocated for the intended virtual registers.
- if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
- MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
- else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
- MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
- else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
+ // TODO: Walking through all MBBs here would be a bad heuristic. Better
+ // handle them elsewhere.
+ if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
NeedExecCopyReservedReg = true;
else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
@@ -1608,6 +1605,25 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
+ SmallVector<Register> SortedWWMVGPRs;
+ for (auto &Reg : MFI->getWWMReservedRegs()) {
+ // The shift-back is needed only for the VGPRs used for SGPR spills, which
+ // are 32 bits in size. (The SIPreAllocateWWMRegs pass can also add register
+ // tuples to the WWM reserved set; those are skipped here.)
+ const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
+ if (TRI->getRegSizeInBits(*RC) > 32)
+ continue;
+ SortedWWMVGPRs.push_back(Reg);
+...
[truncated]
``````````
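
For testing and debugging, allocator selection for the new stage mirrors the existing scheme: the patch adds a `-wwm-regalloc` llc option next to `-sgpr-regalloc` and `-vgpr-regalloc`, registers `basic`, `greedy`, and `fast` implementations for it, and updates the amdgcn error message for plain `-regalloc` to mention all three options.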
https://github.com/llvm/llvm-project/pull/93526