[llvm] Split vgpr regalloc pipeline (PR #93526)

Tue May 28 03:02:12 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-x86

Author: Christudasan Devadasan (cdevadas)

<details>
<summary>Changes</summary>

Allocating wwm-registers and per-thread VGPR operands
together poses many challenges in the way the
registers are reused during allocation. There are
times when regalloc reuses the registers of regular
VGPR operations for wwm-operations within a small range,
unintentionally clobbering their inactive lanes and
causing correctness issues that are hard to trace.

This patch splits the VGPR allocation pipeline further
to allocate wwm-registers first and the regular VGPR
operands in a separate pipeline. The split
ensures that the physical registers used for wwm
allocations do not take part in the next allocation
pipeline, avoiding any such clobbering.

---

Patch is 1.41 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93526.diff


85 Files Affected:

- (modified) llvm/include/llvm/CodeGen/MachineRegisterInfo.h (+2) 
- (modified) llvm/include/llvm/CodeGen/RegAllocCommon.h (+6-3) 
- (modified) llvm/lib/CodeGen/RegAllocBase.cpp (+1-2) 
- (modified) llvm/lib/CodeGen/RegAllocFast.cpp (+1-2) 
- (modified) llvm/lib/CodeGen/RegAllocGreedy.cpp (+3-3) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+4) 
- (added) llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp (+104) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+99-9) 
- (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1) 
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+38-24) 
- (modified) llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp (+123-45) 
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (+21-8) 
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (+14-3) 
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+44-24) 
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.h (+5) 
- (modified) llvm/lib/Target/RISCV/RISCVTargetMachine.cpp (+4-2) 
- (modified) llvm/lib/Target/X86/X86TargetMachine.cpp (+4-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll (+103-116) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (+8-7) 
- (modified) llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll (+19-22) 
- (modified) llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll (+77-107) 
- (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (+309-354) 
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+1852-1919) 
- (modified) llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir (+37-43) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir (+26-14) 
- (modified) llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir (+8-4) 
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+44-44) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+5-5) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+5-5) 
- (modified) llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir (+7-21) 
- (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (+260-262) 
- (modified) llvm/test/CodeGen/AMDGPU/illegal-eviction-assert.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll (+7-30) 
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+22) 
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll (+277-295) 
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll (+318-341) 
- (modified) llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll (+14-21) 
- (modified) llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir (+12-28) 
- (modified) llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir (+3-19) 
- (modified) llvm/test/CodeGen/AMDGPU/pr51516.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir (-6) 
- (modified) llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll (+355-359) 
- (modified) llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+1389-1409) 
- (modified) llvm/test/CodeGen/AMDGPU/remat-vop.mir (+1548-1542) 
- (modified) llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll (+26-4) 
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir (+9-8) 
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir (+153-154) 
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spills-empty-prolog-block.mir (+1-3) 
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll (+27-35) 
- (modified) llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll (+6-8) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir (+7-10) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir (+10-12) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir (+6-6) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir (+3-9) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll (+2-4) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll (+15-22) 
- (modified) llvm/test/CodeGen/AMDGPU/spill192.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/spill224.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/spill288.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/spill320.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/spill352.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/spill384.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll (+53-100) 
- (modified) llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir (+8-8) 
- (modified) llvm/test/CodeGen/AMDGPU/trap-abis.ll (+21-23) 
- (modified) llvm/test/CodeGen/AMDGPU/true16-ra-pre-gfx11-regression-test.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll (+14-21) 
- (modified) llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir (+7-6) 
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll (+11-12) 
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll (+24-28) 
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (+226-245) 
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (+210-292) 


``````````diff

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 09d9a0b4ec402..01d91982ae1c7 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -184,6 +184,8 @@ class MachineRegisterInfo {
       TheDelegate->MRI_NoteCloneVirtualRegister(NewReg, SrcReg);
   }
 
+  const MachineFunction &getMF() const { return *MF; }
+
   //===--------------------------------------------------------------------===//
   // Function State
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/CodeGen/RegAllocCommon.h b/llvm/include/llvm/CodeGen/RegAllocCommon.h
index 757ca8e112eec..f3423083eef3a 100644
--- a/llvm/include/llvm/CodeGen/RegAllocCommon.h
+++ b/llvm/include/llvm/CodeGen/RegAllocCommon.h
@@ -10,22 +10,25 @@
 #define LLVM_CODEGEN_REGALLOCCOMMON_H
 
 #include <functional>
+#include <llvm/CodeGen/Register.h>
 
 namespace llvm {
 
 class TargetRegisterClass;
 class TargetRegisterInfo;
+class MachineRegisterInfo;
 
 typedef std::function<bool(const TargetRegisterInfo &TRI,
-                           const TargetRegisterClass &RC)> RegClassFilterFunc;
+                           const MachineRegisterInfo &MRI, const Register Reg)>
+    RegClassFilterFunc;
 
 /// Default register class filter function for register allocation. All virtual
 /// registers should be allocated.
 static inline bool allocateAllRegClasses(const TargetRegisterInfo &,
-                                         const TargetRegisterClass &) {
+                                         const MachineRegisterInfo &,
+                                         const Register) {
   return true;
 }
-
 }
 
 #endif // LLVM_CODEGEN_REGALLOCCOMMON_H
diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp
index d0dec372f6896..a4645ed93029d 100644
--- a/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -181,8 +181,7 @@ void RegAllocBase::enqueue(const LiveInterval *LI) {
   if (VRM->hasPhys(Reg))
     return;
 
-  const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
-  if (ShouldAllocateClass(*TRI, RC)) {
+  if (ShouldAllocateClass(*TRI, *MRI, Reg)) {
     LLVM_DEBUG(dbgs() << "Enqueuing " << printReg(Reg, TRI) << '\n');
     enqueueImpl(LI);
   } else {
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 6740e1f0edb4f..f6419daba6a2d 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -417,8 +417,7 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
 
 bool RegAllocFast::shouldAllocateRegister(const Register Reg) const {
   assert(Reg.isVirtual());
-  const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
-  return ShouldAllocateClass(*TRI, RC);
+  return ShouldAllocateClass(*TRI, *MRI, Reg);
 }
 
 void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) {
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 348277224c7ae..b3bf1899ceeaf 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -2306,9 +2306,9 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
     if (Reg.isPhysical())
       continue;
 
-    // This may be a skipped class
+    // This may be a skipped register.
     if (!VRM->hasPhys(Reg)) {
-      assert(!ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg)) &&
+      assert(!ShouldAllocateClass(*TRI, *MRI, Reg) &&
              "We have an unallocated variable which should have been handled");
       continue;
     }
@@ -2698,7 +2698,7 @@ bool RAGreedy::hasVirtRegAlloc() {
     const TargetRegisterClass *RC = MRI->getRegClass(Reg);
     if (!RC)
       continue;
-    if (ShouldAllocateClass(*TRI, *RC))
+    if (ShouldAllocateClass(*TRI, *MRI, Reg))
       return true;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 6016bd5187d88..cd9f3fb162fd2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -56,6 +56,7 @@ ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPULateCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
+FunctionPass *createAMDGPUReserveWWMRegsPass();
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
 ModulePass *
 createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
@@ -149,6 +150,9 @@ struct AMDGPULowerBufferFatPointersPass
   const TargetMachine &TM;
 };
 
+void initializeAMDGPUReserveWWMRegsPass(PassRegistry &);
+extern char &AMDGPUReserveWWMRegsID;
+
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp
new file mode 100644
index 0000000000000..5ed8cd4231d00
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp
@@ -0,0 +1,104 @@
+//===-- AMDGPUReserveWWMRegs.cpp - Add WWM Regs to the reserved regs list
+//---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass should be invoked at the end of wwm-regalloc pipeline.
+/// It identifies the WWM regs allocated during this pipeline and add
+/// them to the list of reserved registers so that they won't be available for
+/// per-thread VGPR allocation in the subsequent regalloc pipeline.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-reserve-wwm-regs"
+
+namespace {
+
+class AMDGPUReserveWWMRegs : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUReserveWWMRegs() : MachineFunctionPass(ID) {
+    initializeAMDGPUReserveWWMRegsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Reserve WWM Registers";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(AMDGPUReserveWWMRegs, DEBUG_TYPE,
+                "AMDGPU Reserve WWM Registers", false, false)
+
+char AMDGPUReserveWWMRegs::ID = 0;
+
+char &llvm::AMDGPUReserveWWMRegsID = AMDGPUReserveWWMRegs::ID;
+
+bool AMDGPUReserveWWMRegs::runOnMachineFunction(MachineFunction &MF) {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      unsigned Opc = MI.getOpcode();
+      if (Opc != AMDGPU::SI_SPILL_S32_TO_VGPR &&
+          Opc != AMDGPU::SI_RESTORE_S32_FROM_VGPR)
+        continue;
+
+      Register Reg = Opc == AMDGPU::SI_SPILL_S32_TO_VGPR
+                         ? MI.getOperand(0).getReg()
+                         : MI.getOperand(1).getReg();
+
+      assert(Reg.isPhysical() &&
+             "All WWM registers should have been allocated by now.");
+
+      MFI->reserveWWMRegister(Reg);
+      Changed |= true;
+    }
+  }
+
+  // Reset the renamable flag for MOs involving wwm-regs to get rid of the MIR
+  // Verifier error.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg())
+          continue;
+
+        Register Reg = MO.getReg();
+        if (Reg.isPhysical() &&
+            llvm::is_contained(MFI->getWWMReservedRegs(), Reg))
+          MO.setIsRenamable(false);
+      }
+    }
+  }
+
+  // Now clear the NonWWMRegMask earlier set during wwm-regalloc.
+  MFI->clearNonWWMRegAllocMask();
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dbbfe34a63863..e3375c758b8d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -82,24 +82,44 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
     : RegisterRegAllocBase(N, D, C) {}
 };
 
+class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
+public:
+  WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+      : RegisterRegAllocBase(N, D, C) {}
+};
+
 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
-                              const TargetRegisterClass &RC) {
-  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+                              const MachineRegisterInfo &MRI,
+                              const Register Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
 }
 
 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
-                              const TargetRegisterClass &RC) {
-  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+                              const MachineRegisterInfo &MRI,
+                              const Register Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
 }
 
+static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
+                                const MachineRegisterInfo &MRI,
+                                const Register Reg) {
+  const SIMachineFunctionInfo *MFI =
+      MRI.getMF().getInfo<SIMachineFunctionInfo>();
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
+         MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
+}
 
-/// -{sgpr|vgpr}-regalloc=... command line option.
+/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
 
 /// A dummy default pass factory indicates whether the register allocator is
 /// overridden on the command line.
 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
+static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
 
 static SGPRRegisterRegAlloc
 defaultSGPRRegAlloc("default",
@@ -116,6 +136,11 @@ static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
              cl::desc("Register allocator to use for VGPRs"));
 
+static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
+               RegisterPassParser<WWMRegisterRegAlloc>>
+    WWMRegAlloc("wwm-regalloc", cl::Hidden,
+                cl::init(&useDefaultRegisterAllocator),
+                cl::desc("Register allocator to use for WWM registers"));
 
 static void initializeDefaultSGPRRegisterAllocatorOnce() {
   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
@@ -135,6 +160,15 @@ static void initializeDefaultVGPRRegisterAllocatorOnce() {
   }
 }
 
+static void initializeDefaultWWMRegisterAllocatorOnce() {
+  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
+
+  if (!Ctor) {
+    Ctor = WWMRegAlloc;
+    WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
+  }
+}
+
 static FunctionPass *createBasicSGPRRegisterAllocator() {
   return createBasicRegisterAllocator(onlyAllocateSGPRs);
 }
@@ -159,6 +193,18 @@ static FunctionPass *createFastVGPRRegisterAllocator() {
   return createFastRegisterAllocator(onlyAllocateVGPRs, true);
 }
 
+static FunctionPass *createBasicWWMRegisterAllocator() {
+  return createBasicRegisterAllocator(onlyAllocateWWMRegs);
+}
+
+static FunctionPass *createGreedyWWMRegisterAllocator() {
+  return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
+}
+
+static FunctionPass *createFastWWMRegisterAllocator() {
+  return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
+}
+
 static SGPRRegisterRegAlloc basicRegAllocSGPR(
   "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
 static SGPRRegisterRegAlloc greedyRegAllocSGPR(
@@ -175,7 +221,15 @@ static VGPRRegisterRegAlloc greedyRegAllocVGPR(
 
 static VGPRRegisterRegAlloc fastRegAllocVGPR(
   "fast", "fast register allocator", createFastVGPRRegisterAllocator);
-}
+static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
+                                               "basic register allocator",
+                                               createBasicWWMRegisterAllocator);
+static WWMRegisterRegAlloc
+    greedyRegAllocWWMReg("greedy", "greedy register allocator",
+                         createGreedyWWMRegisterAllocator);
+static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
+                                              createFastWWMRegisterAllocator);
+} // namespace
 
 static cl::opt<bool>
 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
@@ -424,6 +478,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
   initializeAMDGPULowerModuleLDSLegacyPass(*PR);
   initializeAMDGPULowerBufferFatPointersPass(*PR);
+  initializeAMDGPUReserveWWMRegsPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
@@ -923,6 +978,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
 
   FunctionPass *createSGPRAllocPass(bool Optimized);
   FunctionPass *createVGPRAllocPass(bool Optimized);
+  FunctionPass *createWWMRegAllocPass(bool Optimized);
   FunctionPass *createRegAllocPass(bool Optimized) override;
 
   bool addRegAssignAndRewriteFast() override;
@@ -1331,7 +1387,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
 }
 
 bool GCNPassConfig::addPreRewrite() {
-  addPass(&SILowerWWMCopiesID);
   if (EnableRegReassign)
     addPass(&GCNNSAReassignID);
   return true;
@@ -1367,12 +1422,28 @@ FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
   return createFastVGPRRegisterAllocator();
 }
 
+FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
+  // Initialize the global default.
+  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
+                  initializeDefaultWWMRegisterAllocatorOnce);
+
+  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
+  if (Ctor != useDefaultRegisterAllocator)
+    return Ctor();
+
+  if (Optimized)
+    return createGreedyWWMRegisterAllocator();
+
+  return createFastWWMRegisterAllocator();
+}
+
 FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
   llvm_unreachable("should not be used");
 }
 
 static const char RegAllocOptNotSupportedMessage[] =
-  "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
+    "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
+    "and -vgpr-regalloc";
 
 bool GCNPassConfig::addRegAssignAndRewriteFast() {
   if (!usingDefaultRegAlloc())
@@ -1384,11 +1455,20 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
 
   // Equivalent of PEI for SGPRs.
   addPass(&SILowerSGPRSpillsID);
+
+  // To Allocate wwm registers used in whole quad mode operations (for pixel
+  // shaders).
   addPass(&SIPreAllocateWWMRegsID);
 
-  addPass(createVGPRAllocPass(false));
+  // For allocating other wwm register operands.
+  addPass(createWWMRegAllocPass(false));
 
   addPass(&SILowerWWMCopiesID);
+  addPass(&AMDGPUReserveWWMRegsID);
+
+  // For allocating per-thread VGPRs.
+  addPass(createVGPRAllocPass(false));
+
   return true;
 }
 
@@ -1408,8 +1488,18 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
 
   // Equivalent of PEI for SGPRs.
   addPass(&SILowerSGPRSpillsID);
+
+  // To Allocate wwm registers used in whole quad mode operations (for pixel
+  // shaders).
   addPass(&SIPreAllocateWWMRegsID);
 
+  // For allocating other whole wave mode registers.
+  addPass(createWWMRegAllocPass(true));
+  addPass(&SILowerWWMCopiesID);
+  addPass(createVirtRegRewriter(false));
+  addPass(&AMDGPUReserveWWMRegsID);
+
+  // For allocating per-thread VGPRs.
   addPass(createVGPRAllocPass(true));
 
   addPreRewrite();
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c992352cb78da..178af07048571 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -94,6 +94,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPURegBankSelect.cpp
   AMDGPURegisterBankInfo.cpp
   AMDGPURemoveIncompatibleFunctions.cpp
+  AMDGPUReserveWWMRegs.cpp
   AMDGPUResourceUsageAnalysis.cpp
   AMDGPURewriteOutArguments.cpp
   AMDGPURewriteUndefForPHI.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index eae666ab0e7d7..fbb6c3d9fe24b 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1555,6 +1555,17 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
   }
 }
 
+// Mark all WWM VGPRs as BB LiveIns.
+static void addWwmRegBBLiveIn(MachineFunction &MF) {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  for (MachineBasicBlock &MBB : MF) {
+    for (auto &Reg : MFI->getWWMReservedRegs())
+      MBB.addLiveIn(Reg);
+
+    MBB.sortUniqueLiveIns();
+  }
+}
+
 // Only report VGPRs to generic code.
 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                            BitVector &SavedVGPRs,
@@ -1567,11 +1578,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
   if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
     return;
 
-  MFI->shiftSpillPhysVGPRsToLowestRange(MF);
-
   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
-  if (MFI->isEntryFunction())
-    return;
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -1581,19 +1588,9 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
   MachineInstr *ReturnMI = nullptr;
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
-      // WRITELANE instructions used for SGPR spills can overwrite the inactive
-      // lanes of VGPRs and callee must spill and restore them even if they are
-      // marked Caller-saved.
-
-      // TODO: Handle this elsewhere at an early point. Walking through all MBBs
-      // here would be a bad heuristic. A better way should be by calling
-      // allocateWWMSpill during the regalloc pipeline whenever a physical
-      // register is allocated for the intended virtual registers.
-      if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
-        MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
-      else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
-        MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
-      else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
+      // TODO: Walking through all MBBs here would be a bad heuristic. Better
+      // handle them elsewhere.
+      if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
         NeedExecCopyReservedReg = true;
       else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
                MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
@@ -1608,6 +1605,25 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
     }
   }
 
+  SmallVector<Register> SortedWWMVGPRs;
+  for (auto &Reg : MFI->getWWMReservedRegs()) {
+    // The shift-back is needed only for the VGPRs used for SGPR spills and they
+    // are of 32-bit size. SIPreAllocateWWMRegs pass can add tuples into WWM
+    // reserved registers.
+    const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
+    if (TRI->getRegSizeInBits(*RC) > 32)
+      continue;
+    SortedWWMVGPRs.push_back(Reg);
+...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/93526