[llvm] [AMDGPU] Verify dominance when rewriting spills to registers (PR #167347)
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 30 21:00:06 PST 2025
https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/167347
>From 5eb1e66c1cb70721a9082e80a310a8211e8b036f Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Mon, 3 Nov 2025 22:12:39 -0800
Subject: [PATCH] [AMDGPU] Verify dominance when rewriting spills to registers
Rev1: Updated the condition to check for "joint domination", i.e. no reload
is reachable from entry without first reaching a store to the same slot.
Still working on a reduced test or unit test.
When performing spill elimination in the AGPR copy rewrite pass it was
possible to see spill reloads that were not jointly dominated by any
store. This caused invalid MIR to be generated where vreg uses were not
dominated by defs. This patch adds a joint dominance check before
rewriting spills.
---
.../AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 114 +++++++++-
.../Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h | 32 +++
.../AMDGPU/AMDGPURewriteAGPRCopyMFMATest.cpp | 196 ++++++++++++++++++
llvm/unittests/Target/AMDGPU/CMakeLists.txt | 1 +
4 files changed, 339 insertions(+), 4 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h
create mode 100644 llvm/unittests/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMATest.cpp
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index 89c16dadb4b41..d5d3787d51c8c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -22,6 +22,7 @@
///
//===----------------------------------------------------------------------===//
+#include "AMDGPURewriteAGPRCopyMFMA.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
@@ -30,6 +31,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
@@ -39,6 +41,72 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-rewrite-agpr-copy-mfma"
+namespace llvm {
+namespace AMDGPU {
+bool checkAGPRCopyMFMAJointDominance(
+ const MachineFunction &MF, const MachineDominatorTree &MDT,
+ const SmallVectorImpl<MachineInstr *> &Stores,
+ const SmallVectorImpl<MachineInstr *> &Loads, int Slot) {
+ SmallPtrSet<MachineBasicBlock *, 4> StoreBlocks;
+ for (MachineInstr *S : Stores)
+ if (MDT.isReachableFromEntry(S->getParent()))
+ StoreBlocks.insert(S->getParent());
+
+ if (StoreBlocks.empty())
+ return false;
+
+ // Compute blocks reachable from entry without passing through a store
+ // block.
+ SmallPtrSet<MachineBasicBlock *, 16> StoreFreeReachable;
+ SmallVector<MachineBasicBlock *, 16> Worklist;
+
+ MachineBasicBlock &EntryMBB = const_cast<MachineBasicBlock &>(MF.front());
+ Worklist.push_back(&EntryMBB);
+ StoreFreeReachable.insert(&EntryMBB);
+
+ while (!Worklist.empty()) {
+ MachineBasicBlock *MBB = Worklist.pop_back_val();
+ if (StoreBlocks.contains(MBB))
+ continue;
+
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (StoreFreeReachable.insert(Succ).second)
+ Worklist.push_back(Succ);
+ }
+ }
+
+ auto IsLoadJointlyDominatedByStores = [&](MachineInstr *LoadMI) -> bool {
+ MachineBasicBlock *LoadMBB = LoadMI->getParent();
+ if (!MDT.isReachableFromEntry(LoadMBB))
+ return true;
+
+ // Check if every path passed through a store block.
+ if (!StoreFreeReachable.contains(LoadMBB))
+ return true;
+
+ // Otherwise, there exists a path to this block that has not seen any
+ // store yet. We must ensure that within this block there is a store to
+ // this slot before the load.
+ for (MachineInstr &MI : *LoadMBB) {
+ if (&MI == LoadMI)
+ break;
+ if (MI.mayStore()) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (MO.isFI() && MO.getIndex() == Slot)
+ return true;
+ }
+ }
+ }
+
+ return false;
+ };
+
+ return llvm::all_of(Loads, IsLoadJointlyDominatedByStores);
+}
+
+} // namespace AMDGPU
+} // namespace llvm
+
namespace {
STATISTIC(NumMFMAsRewrittenToAGPR,
@@ -58,6 +126,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
LiveIntervals &LIS;
LiveStacks &LSS;
const RegisterClassInfo &RegClassInfo;
+ MachineDominatorTree &MDT;
bool attemptReassignmentsToAGPR(SmallSetVector<Register, 4> &InterferingRegs,
MCPhysReg PrefPhysReg) const;
@@ -66,10 +135,11 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM,
LiveRegMatrix &LRM, LiveIntervals &LIS,
LiveStacks &LSS,
- const RegisterClassInfo &RegClassInfo)
+ const RegisterClassInfo &RegClassInfo,
+ MachineDominatorTree &MDT)
: MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
- LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo) {}
+ LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo), MDT(MDT) {}
bool isRewriteCandidate(const MachineInstr &MI) const {
return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1;
@@ -515,6 +585,37 @@ void AMDGPURewriteAGPRCopyMFMAImpl::eliminateSpillsOfReassignedVGPRs() const {
if (SpillReferences == SpillSlotReferences.end())
continue;
+ // For each spill reload, every path from entry to the reload must pass
+ // through at least one spill store to the same stack slot.
+ SmallVector<MachineInstr *, 4> Stores, Loads;
+ Stores.reserve(SpillReferences->second.size());
+ Loads.reserve(SpillReferences->second.size());
+ for (MachineInstr *MI : SpillReferences->second) {
+ if (MI->mayStore())
+ Stores.push_back(MI);
+ else if (MI->mayLoad())
+ Loads.push_back(MI);
+ }
+
+ SmallPtrSet<MachineBasicBlock *, 4> StoreBlocks;
+ for (MachineInstr *S : Stores)
+ if (MDT.isReachableFromEntry(S->getParent()))
+ StoreBlocks.insert(S->getParent());
+
+ if (StoreBlocks.empty()) {
+ LLVM_DEBUG(dbgs() << "Skipping " << printReg(Slot, &TRI)
+ << ": no reachable stores\n");
+ continue;
+ }
+
+ if (!AMDGPU::checkAGPRCopyMFMAJointDominance(MF, MDT, Stores, Loads,
+ Slot)) {
+ LLVM_DEBUG(
+ dbgs() << "Skipping " << printReg(Slot, &TRI)
+ << ": some reachable load not jointly dominated by stores\n");
+ continue;
+ }
+
const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot);
LLVM_DEBUG(dbgs() << "Trying to eliminate " << printReg(Slot, &TRI)
@@ -603,11 +704,13 @@ class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass {
AU.addRequired<VirtRegMapWrapperLegacy>();
AU.addRequired<LiveRegMatrixWrapperLegacy>();
AU.addRequired<LiveStacksWrapperLegacy>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addPreserved<LiveIntervalsWrapperPass>();
AU.addPreserved<VirtRegMapWrapperLegacy>();
AU.addPreserved<LiveRegMatrixWrapperLegacy>();
AU.addPreserved<LiveStacksWrapperLegacy>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -622,6 +725,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE,
"AMDGPU Rewrite AGPR-Copy-MFMA", false, false)
@@ -641,7 +745,8 @@ bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction(
auto &LRM = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
auto &LSS = getAnalysis<LiveStacksWrapperLegacy>().getLS();
- AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
+ auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo, MDT);
return Impl.run(MF);
}
@@ -652,10 +757,11 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF,
LiveRegMatrix &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
LiveStacks &LSS = MFAM.getResult<LiveStacksAnalysis>(MF);
+ MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
RegisterClassInfo RegClassInfo;
RegClassInfo.runOnMachineFunction(MF);
- AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
+ AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo, MDT);
if (!Impl.run(MF))
return PreservedAnalyses::all();
auto PA = getMachineFunctionPassPreservedAnalyses();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h
new file mode 100644
index 0000000000000..624860f8cb978
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h
@@ -0,0 +1,32 @@
+//===- AMDGPURewriteAGPRCopyMFMA.h ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H
+
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+class MachineFunction;
+class MachineDominatorTree;
+class MachineInstr;
+
+namespace AMDGPU {
+
+/// Returns true if every reload in \p Loads is jointly dominated by at least
+/// one store in \p Stores to the same stack slot \p Slot.
+bool checkAGPRCopyMFMAJointDominance(
+ const MachineFunction &MF, const MachineDominatorTree &MDT,
+ const SmallVectorImpl<MachineInstr *> &Stores,
+ const SmallVectorImpl<MachineInstr *> &Loads, int Slot);
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMATest.cpp b/llvm/unittests/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMATest.cpp
new file mode 100644
index 0000000000000..5e2ca64224146
--- /dev/null
+++ b/llvm/unittests/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMATest.cpp
@@ -0,0 +1,196 @@
+#include "AMDGPURewriteAGPRCopyMFMA.h"
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPUUnitTests.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPURewriteAGPRCopyMFMATest : public testing::Test {
+protected:
+ std::unique_ptr<LLVMContext> Context;
+ std::unique_ptr<Module> M;
+ std::unique_ptr<const TargetMachine> TM;
+ std::unique_ptr<MachineModuleInfo> MMI;
+ std::unique_ptr<MachineFunction> MF;
+ std::unique_ptr<MachineDominatorTree> MDT;
+
+ void SetUp() override {
+ TM = createAMDGPUTargetMachine("amdgcn-amd-amdhsa", "gfx90a", "");
+ if (!TM)
+ return;
+
+ Context = std::make_unique<LLVMContext>();
+ M = std::make_unique<Module>("TestModule", *Context);
+ MMI = std::make_unique<MachineModuleInfo>(TM.get());
+ }
+
+ void createMachineFunction() {
+ M->setDataLayout(TM->createDataLayout());
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Context), false);
+ Function *F = Function::Create(FTy, Function::ExternalLinkage, "test", *M);
+ MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
+ MMI->getContext(), 0);
+ }
+};
+
+TEST_F(AMDGPURewriteAGPRCopyMFMATest, JointDominanceDiamond) {
+ if (!TM)
+ return;
+ createMachineFunction();
+
+ // Create CFG:
+ // Entry -> StoreBB
+ // Entry -> NoStoreBB
+ // StoreBB -> MergeBB
+ // NoStoreBB -> MergeBB
+
+ MachineBasicBlock *Entry = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *StoreBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *NoStoreBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *MergeBB = MF->CreateMachineBasicBlock();
+
+ MF->push_back(Entry);
+ MF->push_back(StoreBB);
+ MF->push_back(NoStoreBB);
+ MF->push_back(MergeBB);
+
+ Entry->addSuccessor(StoreBB);
+ Entry->addSuccessor(NoStoreBB);
+
+ StoreBB->addSuccessor(MergeBB);
+
+ NoStoreBB->addSuccessor(MergeBB);
+
+ MDT = std::make_unique<MachineDominatorTree>();
+ MDT->recalculate(*MF);
+
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ DebugLoc DL;
+
+ MachineInstr *StoreMI =
+ BuildMI(StoreBB, DL, TII->get(TargetOpcode::INLINEASM))
+ .addExternalSymbol("store")
+ .addImm(1 /* SideEffects */);
+
+ MachineInstr *LoadMI = BuildMI(MergeBB, DL, TII->get(TargetOpcode::INLINEASM))
+ .addExternalSymbol("load")
+ .addImm(1);
+
+ int Slot = MF->getFrameInfo().CreateSpillStackObject(4, Align(4));
+ StoreMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+ LoadMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+
+ SmallVector<MachineInstr *, 4> Stores = {StoreMI};
+ SmallVector<MachineInstr *, 4> Loads = {LoadMI};
+
+ bool Result =
+ AMDGPU::checkAGPRCopyMFMAJointDominance(*MF, *MDT, Stores, Loads, Slot);
+ EXPECT_FALSE(Result);
+}
+
+TEST_F(AMDGPURewriteAGPRCopyMFMATest, LoadBeforeStoreInLoop) {
+ if (!TM)
+ return;
+ createMachineFunction();
+
+ // Entry -> LoopBB -> Exit
+ // LoopBB -> LoopBB
+
+ MachineBasicBlock *Entry = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *Exit = MF->CreateMachineBasicBlock();
+
+ MF->push_back(Entry);
+ MF->push_back(LoopBB);
+ MF->push_back(Exit);
+
+ Entry->addSuccessor(LoopBB);
+ LoopBB->addSuccessor(LoopBB);
+ LoopBB->addSuccessor(Exit);
+
+ MDT = std::make_unique<MachineDominatorTree>();
+ MDT->recalculate(*MF);
+
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ DebugLoc DL;
+
+ MachineInstr *LoadMI = BuildMI(LoopBB, DL, TII->get(TargetOpcode::INLINEASM))
+ .addExternalSymbol("load")
+ .addImm(1);
+
+ MachineInstr *StoreMI = BuildMI(LoopBB, DL, TII->get(TargetOpcode::INLINEASM))
+ .addExternalSymbol("store")
+ .addImm(1);
+
+ int Slot = MF->getFrameInfo().CreateSpillStackObject(4, Align(4));
+ StoreMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+ LoadMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+
+ SmallVector<MachineInstr *, 4> Stores = {StoreMI};
+ SmallVector<MachineInstr *, 4> Loads = {LoadMI};
+
+ bool Result =
+ AMDGPU::checkAGPRCopyMFMAJointDominance(*MF, *MDT, Stores, Loads, Slot);
+ EXPECT_FALSE(Result);
+}
+
+TEST_F(AMDGPURewriteAGPRCopyMFMATest, DominatedByPredecessor) {
+ if (!TM)
+ return;
+ createMachineFunction();
+
+ // Entry -> StoreBB -> LoadBB
+
+ MachineBasicBlock *Entry = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *StoreBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *LoadBB = MF->CreateMachineBasicBlock();
+
+ MF->push_back(Entry);
+ MF->push_back(StoreBB);
+ MF->push_back(LoadBB);
+
+ Entry->addSuccessor(StoreBB);
+ StoreBB->addSuccessor(LoadBB);
+
+ MDT = std::make_unique<MachineDominatorTree>();
+ MDT->recalculate(*MF);
+
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ DebugLoc DL;
+
+ MachineInstr *StoreMI =
+ BuildMI(StoreBB, DL, TII->get(TargetOpcode::INLINEASM))
+ .addExternalSymbol("store")
+ .addImm(1);
+
+ MachineInstr *LoadMI = BuildMI(LoadBB, DL, TII->get(TargetOpcode::INLINEASM))
+ .addExternalSymbol("load")
+ .addImm(1);
+
+ int Slot = MF->getFrameInfo().CreateSpillStackObject(4, Align(4));
+ StoreMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+ LoadMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+
+ SmallVector<MachineInstr *, 4> Stores = {StoreMI};
+ SmallVector<MachineInstr *, 4> Loads = {LoadMI};
+
+ bool Result =
+ AMDGPU::checkAGPRCopyMFMAJointDominance(*MF, *MDT, Stores, Loads, Slot);
+ EXPECT_TRUE(Result);
+}
+
+} // namespace
diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
index d6cbaf3f3fb5d..57476db34d1f6 100644
--- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
@@ -25,4 +25,5 @@ add_llvm_target_unittest(AMDGPUTests
ExecMayBeModifiedBeforeAnyUse.cpp
LiveRegUnits.cpp
PALMetadata.cpp
+ AMDGPURewriteAGPRCopyMFMATest.cpp
)
More information about the llvm-commits
mailing list