[llvm] Move SI Lower Control Flow Up (PR #159557)

Patrick Simmons via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 18 04:36:30 PDT 2025


https://github.com/linuxrocks123 created https://github.com/llvm/llvm-project/pull/159557

Still needs tests and testing.  Also needs camel case.  If the snake case bothers anyone, I'll fix it now.


>From 5497afab88cc7b02964a3a2ae3cd29523cb29d31 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Mon, 15 Sep 2025 08:35:00 -0500
Subject: [PATCH 1/7] In-progress work

---
 .../lib/Target/AMDGPU/SICustomBranchBundles.h | 144 ++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/SICustomBranchBundles.h

diff --git a/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
new file mode 100644
index 0000000000000..fc96c4370879e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
@@ -0,0 +1,144 @@
+#pragma once
+
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#include "SIInstrInfo.h"
+
+#include <cassert>
+#include <unordered_set>
+
+using namespace llvm;
+
+using std::unordered_set;
+using std::vector;
+
+static inline MachineInstr &get_branch_with_dest(MachineBasicBlock &branching_MBB,
+                                                 MachineBasicBlock &dest_MBB) {
+  auto& TII = *branching_MBB.getParent()->getSubtarget<GCNSubtarget>().getInstrInfo();
+  for (MachineInstr &branch_MI : reverse(branching_MBB.instrs()))
+    if (branch_MI.isBranch() && TII.getBranchDestBlock(branch_MI) == &dest_MBB)
+      return branch_MI;
+
+  llvm_unreachable("Don't call this if there's no branch to the destination.");
+}
+
+static inline void move_ins_before_phis(MachineInstr &MI,
+                                            MachineBasicBlock &MBB) {
+  MachineFunction& MF = *MBB.getParent();
+  auto& TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+
+  if (MBB.size() == 0 || MBB.front().getOpcode() != AMDGPU::PHI /*do we need to check for debug instructions here?*/)
+    MBB.insert(MBB.begin(), &MI);
+  else
+    for (auto* pred_MBB : MBB.predecessors())
+    {
+      MachineInstr& branch_MI = get_branch_with_dest(*pred_MBB,MBB);
+      if (branch_MI.isBranch() && TII.getBranchDestBlock(branch_MI) == &MBB) {
+        MachineInstr* cloned_MI = MF.CloneMachineInstr(&MI);
+        pred_MBB->insertAfterBundle(branch_MI.getIterator(), cloned_MI);
+        cloned_MI->bundleWithPred();
+      }
+    }
+
+  MI.eraseFromParent();
+}
+
+static inline MachineBasicBlock::instr_iterator
+get_epilog_for_successor(MachineBasicBlock& pred_MBB, MachineBasicBlock& succ_MBB) {
+  MachineFunction& MF = *pred_MBB.getParent();
+  auto& TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+
+  for (MachineInstr &branch_MI : reverse(pred_MBB.instrs()))
+    if (branch_MI.isBranch() && TII.getBranchDestBlock(branch_MI) == &succ_MBB)
+      return std::next(branch_MI.getIterator());
+    else
+      assert(branch_MI.isBranch() && "Shouldn't have fall-throughs here.");
+
+  llvm_unreachable("There should always be a branch to succ_MBB.");
+}
+
+static inline void normalize_ir_post_phi_elimination(MachineFunction &MF) {
+  auto& TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+
+  struct CFG_Rewrite_Entry {
+    unordered_set<MachineBasicBlock *> pred_MBBs;
+    MachineBasicBlock *succ_MBB;
+    vector<MachineInstr*> body;
+  };
+
+  auto epilogs_are_identical = [](const vector<MachineInstr *> left,
+                                  const vector<MachineInstr *> right) {
+    if (left.size() != right.size())
+      return false;
+
+    for (unsigned i = 0; i < left.size(); i++)
+      if (!left[i]->isIdenticalTo(*right[i]))
+        return false;
+    return true;
+  };
+
+  auto move_body = [](vector<MachineInstr *> &body,
+                      MachineBasicBlock &dest_MBB) {
+    for (auto rev_it = body.rbegin(); rev_it != body.rend(); rev_it++) {
+      MachineInstr &body_ins = **rev_it;
+      body_ins.removeFromBundle();
+      dest_MBB.insert(dest_MBB.begin(), &body_ins);
+    }
+  };
+  
+  vector<CFG_Rewrite_Entry> cfg_rewrite_entries;
+  for (MachineBasicBlock &MBB : MF) {
+    CFG_Rewrite_Entry to_insert = {{}, &MBB, {}};
+    for (MachineBasicBlock *pred_MBB : MBB.predecessors()) {
+      MachineBasicBlock::instr_iterator ep_it =
+          get_epilog_for_successor(*pred_MBB, MBB)->getIterator();
+
+      vector<MachineInstr *> epilog;
+      while (!ep_it.isEnd())
+        epilog.push_back(&*ep_it++);
+
+      if (!epilogs_are_identical(to_insert.body, epilog)) {
+        if (to_insert.pred_MBBs.size() && to_insert.body.size()) {
+          // Potentially, we need to insert a new entry.  But first see if we
+          // can find an existing entry with the same epilog.
+          bool existing_entry_found = false;
+          for (auto rev_it = cfg_rewrite_entries.rbegin();
+               rev_it != cfg_rewrite_entries.rend() && rev_it->succ_MBB == &MBB;
+               rev_it++)
+            if (epilogs_are_identical(rev_it->body, epilog)) {
+              rev_it->pred_MBBs.insert(pred_MBB);
+              existing_entry_found = true;
+              break;
+            }
+
+          if(!existing_entry_found)
+            cfg_rewrite_entries.push_back(to_insert);
+        }
+        to_insert.pred_MBBs.clear();
+        to_insert.body = epilog;
+      }
+      
+      to_insert.pred_MBBs.insert(pred_MBB);
+    }
+
+    // Handle the last potential rewrite entry.  Lower instead of journaling a
+    // rewrite entry if all predecessor MBBs are in this single entry.
+    if (to_insert.pred_MBBs.size() == MBB.pred_size())
+      // Lower
+      move_body(to_insert.body,MBB);
+    else if (to_insert.body.size())
+      cfg_rewrite_entries.push_back(to_insert);
+
+  // Perform the journaled rewrites.
+  for (const auto &entry : cfg_rewrite_entries) {
+    MachineBasicBlock *mezzanine_MBB = MF.CreateMachineBasicBlock();
+    for(MachineBasicBlock* pred_MBB : entry.pred_MBBs)
+    {
+      
+    }
+  }
+}

>From 38909c5c056f4f98485a14c67ccfc201b1b25515 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 16 Sep 2025 05:01:32 -0500
Subject: [PATCH 2/7] Finish initial implementation ... I hope.

---
 .../lib/Target/AMDGPU/SICustomBranchBundles.h | 41 ++++++++++++++-----
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp |  3 ++
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
index fc96c4370879e..81872f4fe8ab4 100644
--- a/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
+++ b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
@@ -26,14 +26,17 @@ static inline MachineInstr &get_branch_with_dest(MachineBasicBlock &branching_MB
   llvm_unreachable("Don't call this if there's no branch to the destination.");
 }
 
-static inline void move_ins_before_phis(MachineInstr &MI,
-                                            MachineBasicBlock &MBB) {
+static inline void move_ins_before_phis(MachineInstr &MI) {
+  MachineBasicBlock& MBB = *MI.getParent();
   MachineFunction& MF = *MBB.getParent();
   auto& TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
 
-  if (MBB.size() == 0 || MBB.front().getOpcode() != AMDGPU::PHI /*do we need to check for debug instructions here?*/)
+  if (MBB.size() == 0 ||
+      MBB.front().getOpcode() !=
+          AMDGPU::PHI /*do we need to check for debug instructions here?*/) {
+    MI.removeFromParent();
     MBB.insert(MBB.begin(), &MI);
-  else
+  } else {
     for (auto* pred_MBB : MBB.predecessors())
     {
       MachineInstr& branch_MI = get_branch_with_dest(*pred_MBB,MBB);
@@ -43,8 +46,8 @@ static inline void move_ins_before_phis(MachineInstr &MI,
         cloned_MI->bundleWithPred();
       }
     }
-
-  MI.eraseFromParent();
+    MI.eraseFromParent();
+  }
 }
 
 static inline MachineBasicBlock::instr_iterator
@@ -132,13 +135,31 @@ static inline void normalize_ir_post_phi_elimination(MachineFunction &MF) {
       move_body(to_insert.body,MBB);
     else if (to_insert.body.size())
       cfg_rewrite_entries.push_back(to_insert);
+  }
 
   // Perform the journaled rewrites.
-  for (const auto &entry : cfg_rewrite_entries) {
+  for (auto &entry : cfg_rewrite_entries) {
     MachineBasicBlock *mezzanine_MBB = MF.CreateMachineBasicBlock();
-    for(MachineBasicBlock* pred_MBB : entry.pred_MBBs)
-    {
-      
+
+    // Deal with mezzanine to successor succession.
+    BuildMI(mezzanine_MBB, DebugLoc(), TII.get(AMDGPU::S_BRANCH)).addMBB(entry.succ_MBB);
+    mezzanine_MBB->addSuccessor(entry.succ_MBB);
+
+    // Move instructions to mezzanine block.
+    move_body(entry.body, *mezzanine_MBB);
+
+    for (MachineBasicBlock *pred_MBB : entry.pred_MBBs) {
+      //Deal with predecessor to mezzanine succession.
+      MachineInstr &branch_ins =
+          get_branch_with_dest(*pred_MBB, *entry.succ_MBB);
+      assert(branch_ins.getOperand(0).isMBB() && "Branch instruction isn't.");
+      branch_ins.getOperand(0).setMBB(mezzanine_MBB);
+      pred_MBB->replaceSuccessor(entry.succ_MBB, mezzanine_MBB);
+
+      // Delete instructions that were lowered from epilog
+      auto epilog_it = get_epilog_for_successor(*pred_MBB, *entry.succ_MBB);
+      while (!epilog_it.isEnd())
+        epilog_it++->eraseFromBundle();
     }
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 115a020f44098..ee4d6709eeafc 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -48,6 +48,7 @@
 /// %exec = S_OR_B64 %exec, %sgpr0     // Re-enable saved exec mask bits
 //===----------------------------------------------------------------------===//
 
+#include "SICustomBranchBundles.h"
 #include "SILowerControlFlow.h"
 #include "AMDGPU.h"
 #include "AMDGPULaneMaskUtils.h"
@@ -323,6 +324,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   if (LV)
     LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec);
 
+  move_ins_before_phis(*OrSaveExec);
+
   MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
 
   MachineBasicBlock::iterator ElsePt(MI);

>From 15da6ccc03e2dbf6889f26fe71c57405a7a7c404 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 16 Sep 2025 05:39:36 -0500
Subject: [PATCH 3/7] Now to write the pass to get rid of the bundles

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 +++++---
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp  | 8 ++++++++
 llvm/lib/Target/AMDGPU/SILowerControlFlow.h    | 8 ++++++++
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 92a587b5771b6..fa454730364c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1563,7 +1563,7 @@ void GCNPassConfig::addFastRegAlloc() {
   // This must be run immediately after phi elimination and before
   // TwoAddressInstructions, otherwise the processing of the tied operand of
   // SI_ELSE will introduce a copy of the tied operand source after the else.
-  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
+  //insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
 
   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
 
@@ -1586,10 +1586,12 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (OptVGPRLiveRange)
     insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID);
 
+  insertPass(&SIOptimizeVGPRLiveRangeLegacyID, &SILowerControlFlowLegacyID);
+
   // This must be run immediately after phi elimination and before
   // TwoAddressInstructions, otherwise the processing of the tied operand of
   // SI_ELSE will introduce a copy of the tied operand source after the else.
-  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
+  //insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
 
   if (EnableRewritePartialRegUses)
     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
@@ -2256,7 +2258,7 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
   // This must be run immediately after phi elimination and before
   // TwoAddressInstructions, otherwise the processing of the tied operand of
   // SI_ELSE will introduce a copy of the tied operand source after the else.
-  insertPass<PHIEliminationPass>(SILowerControlFlowPass());
+  //insertPass<PHIEliminationPass>(SILowerControlFlowPass());
 
   if (EnableRewritePartialRegUses)
     insertPass<RenameIndependentSubregsPass>(GCNRewritePartialRegUsesPass());
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index ee4d6709eeafc..ea3d2ed61a808 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -153,6 +153,14 @@ class SILowerControlFlowLegacy : public MachineFunctionPass {
     return "SI Lower control flow pseudo instructions";
   }
 
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().setIsSSA();
+  }
+
+  MachineFunctionProperties getClearedProperties() const override {
+    return MachineFunctionProperties().setNoPHIs();
+  }
+  
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addUsedIfAvailable<LiveIntervalsWrapperPass>();
     // Should preserve the same set that TwoAddressInstructions does.
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.h b/llvm/lib/Target/AMDGPU/SILowerControlFlow.h
index 23803c679c246..0f4df79952999 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.h
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.h
@@ -16,6 +16,14 @@ class SILowerControlFlowPass : public PassInfoMixin<SILowerControlFlowPass> {
 public:
   PreservedAnalyses run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM);
+
+  MachineFunctionProperties getRequiredProperties() const {
+    return MachineFunctionProperties().setIsSSA();
+  }
+
+  MachineFunctionProperties getClearedProperties() const {
+    return MachineFunctionProperties().setNoPHIs();
+  }
 };
 } // namespace llvm
 

>From c7439fec4a567543a4132a85c42150ba6113bcf3 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 16 Sep 2025 06:07:26 -0500
Subject: [PATCH 4/7] For my next act, I'll actually make this SSA form by
 creating different VREGs in the hoisted instructions and merging them with
 PHI.

---
 llvm/lib/Target/AMDGPU/SICustomBranchBundles.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
index 81872f4fe8ab4..51d1e84a47c4c 100644
--- a/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
+++ b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
@@ -31,9 +31,14 @@ static inline void move_ins_before_phis(MachineInstr &MI) {
   MachineFunction& MF = *MBB.getParent();
   auto& TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
 
-  if (MBB.size() == 0 ||
-      MBB.front().getOpcode() !=
-          AMDGPU::PHI /*do we need to check for debug instructions here?*/) {
+  bool phi_seen = false;
+  for (MachineInstr &maybe_phi : MBB)
+    if (maybe_phi.getOpcode() == AMDGPU::PHI) {
+      phi_seen = true;
+      break;
+    }
+  
+  if (!phi_seen) {
     MI.removeFromParent();
     MBB.insert(MBB.begin(), &MI);
   } else {

>From deb796553adbb6d03e1a0cae50f299bfd1e417dc Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Wed, 17 Sep 2025 06:36:07 -0500
Subject: [PATCH 5/7] More work

---
 .../lib/Target/AMDGPU/SICustomBranchBundles.h | 136 +++++++++++++-----
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp |   6 +
 2 files changed, 109 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
index 51d1e84a47c4c..18888975bdba0 100644
--- a/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
+++ b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
@@ -4,6 +4,7 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/Support/ErrorHandling.h"
 
 #include "SIInstrInfo.h"
@@ -29,11 +30,13 @@ static inline MachineInstr &get_branch_with_dest(MachineBasicBlock &branching_MB
 static inline void move_ins_before_phis(MachineInstr &MI) {
   MachineBasicBlock& MBB = *MI.getParent();
   MachineFunction& MF = *MBB.getParent();
-  auto& TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+  auto &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+  auto& MRI = MF.getRegInfo();
 
   bool phi_seen = false;
-  for (MachineInstr &maybe_phi : MBB)
-    if (maybe_phi.getOpcode() == AMDGPU::PHI) {
+  MachineBasicBlock::iterator first_phi;
+  for (first_phi = MBB.begin(); first_phi != MBB.end(); first_phi++)
+    if (first_phi->getOpcode() == AMDGPU::PHI) {
       phi_seen = true;
       break;
     }
@@ -42,20 +45,46 @@ static inline void move_ins_before_phis(MachineInstr &MI) {
     MI.removeFromParent();
     MBB.insert(MBB.begin(), &MI);
   } else {
-    for (auto* pred_MBB : MBB.predecessors())
-    {
+    auto phi = BuildMI(MBB, first_phi, MI.getDebugLoc(), TII.get(AMDGPU::PHI),
+            MI.getOperand(0).getReg());
+    for (auto *pred_MBB : MBB.predecessors()) {
+      Register cloned_reg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
       MachineInstr& branch_MI = get_branch_with_dest(*pred_MBB,MBB);
-      if (branch_MI.isBranch() && TII.getBranchDestBlock(branch_MI) == &MBB) {
-        MachineInstr* cloned_MI = MF.CloneMachineInstr(&MI);
-        pred_MBB->insertAfterBundle(branch_MI.getIterator(), cloned_MI);
-        cloned_MI->bundleWithPred();
-      }
+      MachineInstr *cloned_MI = MF.CloneMachineInstr(&MI);
+      cloned_MI->getOperand(0).setReg(cloned_reg);
+      phi.addReg(cloned_reg).addMBB(pred_MBB);
+      pred_MBB->insertAfterBundle(branch_MI.getIterator(), cloned_MI);
+      cloned_MI->bundleWithPred();
     }
     MI.eraseFromParent();
   }
 }
 
-static inline MachineBasicBlock::instr_iterator
+struct Epilog_Iterator {
+  MachineBasicBlock::instr_iterator internal_it;
+  Epilog_Iterator(MachineBasicBlock::instr_iterator i) : internal_it(i) {}
+
+  bool operator==(const Epilog_Iterator &other) {
+    return internal_it == other.internal_it;
+  }
+  bool isEnd() { return internal_it.isEnd(); }
+  MachineInstr &operator*() { return *internal_it; }
+  MachineBasicBlock::instr_iterator operator->() { return internal_it; }
+  Epilog_Iterator &operator++() {
+    ++internal_it;
+    if (!internal_it.isEnd() && internal_it->isBranch())
+      internal_it = internal_it->getParent()->instr_end();
+    return *this;
+  }
+  Epilog_Iterator operator++(int ignored) {
+    Epilog_Iterator to_return = *this;
+    ++*this;
+    return to_return;
+  }
+  
+};
+
+static inline Epilog_Iterator
 get_epilog_for_successor(MachineBasicBlock& pred_MBB, MachineBasicBlock& succ_MBB) {
   MachineFunction& MF = *pred_MBB.getParent();
   auto& TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
@@ -69,6 +98,27 @@ get_epilog_for_successor(MachineBasicBlock& pred_MBB, MachineBasicBlock& succ_MB
   llvm_unreachable("There should always be a branch to succ_MBB.");
 }
 
+static inline bool epilogs_are_identical(const vector<MachineInstr *> left,
+                                         const vector<MachineInstr *> right,
+                                         const MachineBasicBlock &succ_MBB) {
+  if (left.size() != right.size())
+    return false;
+  
+  for (unsigned i = 0; i < left.size(); i++)
+    if (!left[i]->isIdenticalTo(*right[i]))
+      return false;
+  return true;
+}
+
+static inline void move_body(vector<MachineInstr *> &body,
+                      MachineBasicBlock &dest_MBB) {
+  for (auto rev_it = body.rbegin(); rev_it != body.rend(); rev_it++) {
+    MachineInstr &body_ins = **rev_it;
+    body_ins.removeFromBundle();
+    dest_MBB.insert(dest_MBB.begin(), &body_ins);
+  }
+}
+
 static inline void normalize_ir_post_phi_elimination(MachineFunction &MF) {
   auto& TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
 
@@ -78,26 +128,6 @@ static inline void normalize_ir_post_phi_elimination(MachineFunction &MF) {
     vector<MachineInstr*> body;
   };
 
-  auto epilogs_are_identical = [](const vector<MachineInstr *> left,
-                                  const vector<MachineInstr *> right) {
-    if (left.size() != right.size())
-      return false;
-
-    for (unsigned i = 0; i < left.size(); i++)
-      if (!left[i]->isIdenticalTo(*right[i]))
-        return false;
-    return true;
-  };
-
-  auto move_body = [](vector<MachineInstr *> &body,
-                      MachineBasicBlock &dest_MBB) {
-    for (auto rev_it = body.rbegin(); rev_it != body.rend(); rev_it++) {
-      MachineInstr &body_ins = **rev_it;
-      body_ins.removeFromBundle();
-      dest_MBB.insert(dest_MBB.begin(), &body_ins);
-    }
-  };
-  
   vector<CFG_Rewrite_Entry> cfg_rewrite_entries;
   for (MachineBasicBlock &MBB : MF) {
     CFG_Rewrite_Entry to_insert = {{}, &MBB, {}};
@@ -109,7 +139,7 @@ static inline void normalize_ir_post_phi_elimination(MachineFunction &MF) {
       while (!ep_it.isEnd())
         epilog.push_back(&*ep_it++);
 
-      if (!epilogs_are_identical(to_insert.body, epilog)) {
+      if (!epilogs_are_identical(to_insert.body, epilog, MBB)) {
         if (to_insert.pred_MBBs.size() && to_insert.body.size()) {
           // Potentially, we need to insert a new entry.  But first see if we
           // can find an existing entry with the same epilog.
@@ -117,7 +147,7 @@ static inline void normalize_ir_post_phi_elimination(MachineFunction &MF) {
           for (auto rev_it = cfg_rewrite_entries.rbegin();
                rev_it != cfg_rewrite_entries.rend() && rev_it->succ_MBB == &MBB;
                rev_it++)
-            if (epilogs_are_identical(rev_it->body, epilog)) {
+            if (epilogs_are_identical(rev_it->body, epilog, MBB)) {
               rev_it->pred_MBBs.insert(pred_MBB);
               existing_entry_found = true;
               break;
@@ -168,3 +198,43 @@ static inline void normalize_ir_post_phi_elimination(MachineFunction &MF) {
     }
   }
 }
+
+namespace std {
+  template <>
+  struct hash<Register>
+  {
+    std::size_t operator()(const Register& r) const
+    {
+         return hash<unsigned>()(r);
+    }
+  };
+}
+
+static inline void hoist_unrelated_copies(MachineFunction &MF) {
+  for (MachineBasicBlock &MBB : MF)
+    for (MachineInstr &branch_MI : MBB) {
+      if (!branch_MI.isBranch())
+        continue;
+
+      unordered_set<Register> related_copy_sources;
+      Epilog_Iterator epilog_it = branch_MI.getIterator();
+      Epilog_Iterator copy_move_it = ++epilog_it;
+      while (!epilog_it.isEnd()) {
+        if (epilog_it->getOpcode() != AMDGPU::COPY)
+          related_copy_sources.insert(epilog_it->getOperand(0).getReg());
+        ++epilog_it;
+      }
+
+      while (!copy_move_it.isEnd()) {
+        Epilog_Iterator next = copy_move_it; ++next;
+        if (copy_move_it->getOpcode() == AMDGPU::COPY &&
+            !related_copy_sources.count(copy_move_it->getOperand(1).getReg())) {
+          MachineInstr &MI_to_move = *copy_move_it;
+          MI_to_move.removeFromBundle();
+          MBB.insert(branch_MI.getIterator(),&MI_to_move);
+        }
+        
+        copy_move_it = next;
+      }
+    }
+}
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index ea3d2ed61a808..c6b8ace9d9718 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -851,6 +851,12 @@ bool SILowerControlFlow::run(MachineFunction &MF) {
   LoweredIf.clear();
   KillBlocks.clear();
 
+  if (Changed)
+    for (MachineBasicBlock &MBB : MF)
+      for (MachineInstr &MI : MBB)
+        if (MI.isBundled())
+          MI.unbundleFromSucc();
+
   return Changed;
 }
 

>From 89db2df158c33b463e1bf9859d934cc5ec401973 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 18 Sep 2025 03:12:03 -0500
Subject: [PATCH 6/7] Works on simple testcase

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  3 ++
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  3 ++
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |  1 +
 .../lib/Target/AMDGPU/SICustomBranchBundles.h | 14 +++---
 .../Target/AMDGPU/SIRestoreNormalEpilog.cpp   | 46 +++++++++++++++++++
 5 files changed, 60 insertions(+), 7 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/SIRestoreNormalEpilog.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0f2c33585884f..f402efb2ea86e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -248,6 +248,9 @@ extern char &AMDGPUPreloadKernArgPrologLegacyID;
 void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
 extern char &AMDGPUPreloadKernelArgumentsLegacyID;
 
+void initializeSIRestoreNormalEpilogLegacyPass(PassRegistry &);
+extern char &SIRestoreNormalEpilogLegacyID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index fa454730364c4..8210ad88b3d3c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -597,6 +597,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIOptimizeExecMaskingLegacyPass(*PR);
   initializeSIPreAllocateWWMRegsLegacyPass(*PR);
   initializeSIFormMemoryClausesLegacyPass(*PR);
+  initializeSIRestoreNormalEpilogLegacyPass(*PR);
   initializeSIPostRABundlerLegacyPass(*PR);
   initializeGCNCreateVOPDLegacyPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
@@ -1595,6 +1596,8 @@ void GCNPassConfig::addOptimizedRegAlloc() {
 
   if (EnableRewritePartialRegUses)
     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
+  
+  insertPass(&RenameIndependentSubregsID,&SIRestoreNormalEpilogLegacyID);
 
   if (isPassEnabled(EnablePreRAOptimizations))
     insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index aae56eef73edd..996cb635b3c9c 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -183,6 +183,7 @@ add_llvm_target(AMDGPUCodeGen
   SIPreEmitPeephole.cpp
   SIProgramInfo.cpp
   SIRegisterInfo.cpp
+  SIRestoreNormalEpilog.cpp
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
 
diff --git a/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
index 18888975bdba0..0f33a10ed8f33 100644
--- a/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
+++ b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
@@ -91,9 +91,7 @@ get_epilog_for_successor(MachineBasicBlock& pred_MBB, MachineBasicBlock& succ_MB
 
   for (MachineInstr &branch_MI : reverse(pred_MBB.instrs()))
     if (branch_MI.isBranch() && TII.getBranchDestBlock(branch_MI) == &succ_MBB)
-      return std::next(branch_MI.getIterator());
-    else
-      assert(branch_MI.isBranch() && "Shouldn't have fall-throughs here.");
+      return ++Epilog_Iterator(branch_MI.getIterator());
 
   llvm_unreachable("There should always be a branch to succ_MBB.");
 }
@@ -132,8 +130,8 @@ static inline void normalize_ir_post_phi_elimination(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF) {
     CFG_Rewrite_Entry to_insert = {{}, &MBB, {}};
     for (MachineBasicBlock *pred_MBB : MBB.predecessors()) {
-      MachineBasicBlock::instr_iterator ep_it =
-          get_epilog_for_successor(*pred_MBB, MBB)->getIterator();
+      Epilog_Iterator ep_it =
+          get_epilog_for_successor(*pred_MBB, MBB);
 
       vector<MachineInstr *> epilog;
       while (!ep_it.isEnd())
@@ -175,6 +173,7 @@ static inline void normalize_ir_post_phi_elimination(MachineFunction &MF) {
   // Perform the journaled rewrites.
   for (auto &entry : cfg_rewrite_entries) {
     MachineBasicBlock *mezzanine_MBB = MF.CreateMachineBasicBlock();
+    MF.insert(MF.end(),mezzanine_MBB);
 
     // Deal with mezzanine to successor succession.
     BuildMI(mezzanine_MBB, DebugLoc(), TII.get(AMDGPU::S_BRANCH)).addMBB(entry.succ_MBB);
@@ -192,7 +191,7 @@ static inline void normalize_ir_post_phi_elimination(MachineFunction &MF) {
       pred_MBB->replaceSuccessor(entry.succ_MBB, mezzanine_MBB);
 
       // Delete instructions that were lowered from epilog
-      auto epilog_it = get_epilog_for_successor(*pred_MBB, *entry.succ_MBB);
+      auto epilog_it = ++Epilog_Iterator(branch_ins.getIterator());
       while (!epilog_it.isEnd())
         epilog_it++->eraseFromBundle();
     }
@@ -228,7 +227,8 @@ static inline void hoist_unrelated_copies(MachineFunction &MF) {
       while (!copy_move_it.isEnd()) {
         Epilog_Iterator next = copy_move_it; ++next;
         if (copy_move_it->getOpcode() == AMDGPU::COPY &&
-            !related_copy_sources.count(copy_move_it->getOperand(1).getReg())) {
+            !related_copy_sources.count(copy_move_it->getOperand(1).getReg())
+            || copy_move_it->getOpcode() == AMDGPU::IMPLICIT_DEF) {
           MachineInstr &MI_to_move = *copy_move_it;
           MI_to_move.removeFromBundle();
           MBB.insert(branch_MI.getIterator(),&MI_to_move);
diff --git a/llvm/lib/Target/AMDGPU/SIRestoreNormalEpilog.cpp b/llvm/lib/Target/AMDGPU/SIRestoreNormalEpilog.cpp
new file mode 100644
index 0000000000000..ef565a9d9ad8f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIRestoreNormalEpilog.cpp
@@ -0,0 +1,46 @@
+#include "SICustomBranchBundles.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-restore-normal-epilog"
+
+namespace
+{
+
+class SIRestoreNormalEpilogLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SIRestoreNormalEpilogLegacy() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    hoist_unrelated_copies(MF);
+    normalize_ir_post_phi_elimination(MF);
+    return true;
+  }
+
+  StringRef getPassName() const override {
+    return "SI Restore Normal Epilog Post PHI Elimination";
+  }
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().setNoPHIs();
+  }
+
+};
+
+} // namespace
+
+INITIALIZE_PASS(SIRestoreNormalEpilogLegacy, DEBUG_TYPE,
+                "SI restore normal epilog", false, false)
+
+char SIRestoreNormalEpilogLegacy::ID;
+char &llvm::SIRestoreNormalEpilogLegacyID = SIRestoreNormalEpilogLegacy::ID;

>From 4d4f71aee04526c30a8cf761f42cfa20e55882de Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 18 Sep 2025 06:31:57 -0500
Subject: [PATCH 7/7] Must delete from all predecessors if lowering.

---
 llvm/lib/Target/AMDGPU/SICustomBranchBundles.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
index 0f33a10ed8f33..0e043b09fa136 100644
--- a/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
+++ b/llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
@@ -163,9 +163,18 @@ static inline void normalize_ir_post_phi_elimination(MachineFunction &MF) {
 
     // Handle the last potential rewrite entry.  Lower instead of journaling a
     // rewrite entry if all predecessor MBBs are in this single entry.
-    if (to_insert.pred_MBBs.size() == MBB.pred_size())
-      // Lower
-      move_body(to_insert.body,MBB);
+    if (to_insert.pred_MBBs.size() == MBB.pred_size()) {
+      move_body(to_insert.body, MBB);
+      for (MachineBasicBlock *pred_MBB : to_insert.pred_MBBs) {
+        // Delete instructions that were lowered from epilog
+        MachineInstr &branch_ins =
+          get_branch_with_dest(*pred_MBB, *to_insert.succ_MBB);
+        auto epilog_it = ++Epilog_Iterator(branch_ins.getIterator());
+        while (!epilog_it.isEnd())
+          epilog_it++->eraseFromBundle();
+      }
+
+    }
     else if (to_insert.body.size())
       cfg_rewrite_entries.push_back(to_insert);
   }



More information about the llvm-commits mailing list