[llvm] [AMDGPU][AMDGPUDemoteSCCBranchToExecz] demote s_cbranch_scc0/1 branches into vcmp + s_cbranch_execz branches (PR #110284)

Fri Sep 27 08:29:23 PDT 2024

Juan Manuel Martinez =?utf-8?q?Caamaño?= <juamarti at amd.com>,
Juan Manuel Martinez =?utf-8?q?Caamaño?= <juamarti at amd.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/110284 at github.com>


llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Juan Manuel Martinez Caamaño (jmmartinez)

<details>
<summary>Changes</summary>

Follow up from https://github.com/llvm/llvm-project/pull/109818, only the last 2 commits matter

---

Patch is 73.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110284.diff


28 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+11) 
- (added) llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp (+306) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def (+1) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+2-1) 
- (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+90) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+14-4) 
- (modified) llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (+10-48) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll (+43-25) 
- (modified) llvm/test/CodeGen/AMDGPU/branch-condition-and.ll (+31-15) 
- (modified) llvm/test/CodeGen/AMDGPU/else.ll (-1) 
- (modified) llvm/test/CodeGen/AMDGPU/fptoi.i128.ll (+2-4) 
- (modified) llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll (+2-1) 
- (removed) llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir (-95) 
- (modified) llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir (+26-4) 
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+4) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll (+6-12) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll (+6-12) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+21-44) 
- (modified) llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir (+1-2) 
- (modified) llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir (+1-2) 
- (modified) llvm/test/CodeGen/AMDGPU/ret_jump.ll (-1) 
- (modified) llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll (+9-4) 
- (modified) llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll (+1-2) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4abb5a63ab6d2c..0eb415e66fe170 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -369,6 +369,17 @@ extern char &AMDGPUCodeGenPrepareID;
 void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &);
 extern char &AMDGPURemoveIncompatibleFunctionsID;
 
+void initializeAMDGPUDemoteSCCBranchToExeczLegacyPass(PassRegistry &);
+extern char &AMDGPUDemoteSCCBranchToExeczLegacyID;
+
+class AMDGPUDemoteSCCBranchToExeczPass
+    : public PassInfoMixin<AMDGPUDemoteSCCBranchToExeczPass> {
+public:
+  AMDGPUDemoteSCCBranchToExeczPass() = default;
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
 void initializeAMDGPULateCodeGenPrepareLegacyPass(PassRegistry &);
 extern char &AMDGPULateCodeGenPrepareLegacyID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
new file mode 100644
index 00000000000000..f5987cc629ce8d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
@@ -0,0 +1,306 @@
+#include <llvm/CodeGen/MachineFunctionPass.h>
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+#define DEBUG_TYPE "amdgpu-demote-scc-to-execz"
+const char PassName[] = "AMDGPU s_cbranch_scc to s_cbranch_execz conversion";
+
+std::optional<unsigned> getVALUOpc(const MachineInstr &MI,
+                                   bool Reverse = false) {
+  unsigned Opc = MI.getOpcode();
+  if (Reverse) {
+    switch (Opc) {
+    case AMDGPU::S_CMP_EQ_I32:
+      Opc = AMDGPU::S_CMP_LG_I32;
+      break;
+    case AMDGPU::S_CMP_LG_I32:
+      Opc = AMDGPU::S_CMP_EQ_I32;
+      break;
+    case AMDGPU::S_CMP_GT_I32:
+      Opc = AMDGPU::S_CMP_LE_I32;
+      break;
+    case AMDGPU::S_CMP_GE_I32:
+      Opc = AMDGPU::S_CMP_LT_I32;
+      break;
+    case AMDGPU::S_CMP_LT_I32:
+      Opc = AMDGPU::S_CMP_GE_I32;
+      break;
+    case AMDGPU::S_CMP_LE_I32:
+      Opc = AMDGPU::S_CMP_GT_I32;
+      break;
+    case AMDGPU::S_CMP_EQ_U32:
+      Opc = AMDGPU::S_CMP_LG_U32;
+      break;
+    case AMDGPU::S_CMP_LG_U32:
+      Opc = AMDGPU::S_CMP_EQ_U32;
+      break;
+    case AMDGPU::S_CMP_GT_U32:
+      Opc = AMDGPU::S_CMP_LE_U32;
+      break;
+    case AMDGPU::S_CMP_GE_U32:
+      Opc = AMDGPU::S_CMP_LT_U32;
+      break;
+    case AMDGPU::S_CMP_LT_U32:
+      Opc = AMDGPU::S_CMP_GE_U32;
+      break;
+    case AMDGPU::S_CMP_LE_U32:
+      Opc = AMDGPU::S_CMP_GT_U32;
+      break;
+    case AMDGPU::S_CMP_EQ_U64:
+      Opc = AMDGPU::S_CMP_LG_U64;
+      break;
+    case AMDGPU::S_CMP_LG_U64:
+      Opc = AMDGPU::S_CMP_EQ_U64;
+      break;
+    default:
+      return std::nullopt;
+    }
+  }
+
+  switch (Opc) {
+  case AMDGPU::S_CMP_EQ_I32:
+    return AMDGPU::V_CMP_EQ_I32_e64;
+  case AMDGPU::S_CMP_LG_I32:
+    return AMDGPU::V_CMP_LT_I32_e64;
+  case AMDGPU::S_CMP_GT_I32:
+    return AMDGPU::V_CMP_GT_I32_e64;
+  case AMDGPU::S_CMP_GE_I32:
+    return AMDGPU::V_CMP_GE_I32_e64;
+  case AMDGPU::S_CMP_LT_I32:
+    return AMDGPU::V_CMP_LT_I32_e64;
+  case AMDGPU::S_CMP_LE_I32:
+    return AMDGPU::V_CMP_LE_I32_e64;
+  case AMDGPU::S_CMP_EQ_U32:
+    return AMDGPU::V_CMP_EQ_U32_e64;
+  case AMDGPU::S_CMP_LG_U32:
+    return AMDGPU::V_CMP_NE_U32_e64;
+  case AMDGPU::S_CMP_GT_U32:
+    return AMDGPU::V_CMP_GT_U32_e64;
+  case AMDGPU::S_CMP_GE_U32:
+    return AMDGPU::V_CMP_GE_U32_e64;
+  case AMDGPU::S_CMP_LT_U32:
+    return AMDGPU::V_CMP_LT_U32_e64;
+  case AMDGPU::S_CMP_LE_U32:
+    return AMDGPU::V_CMP_LE_U32_e64;
+  case AMDGPU::S_CMP_EQ_U64:
+    return AMDGPU::V_CMP_EQ_U64_e64;
+  case AMDGPU::S_CMP_LG_U64:
+    return AMDGPU::V_CMP_NE_U64_e64;
+  default:
+    return std::nullopt;
+  }
+}
+
+bool isSCmpPromotableToVCmp(const MachineInstr &MI) {
+  return getVALUOpc(MI).has_value();
+}
+
+bool isTriangular(MachineBasicBlock &Head, MachineBasicBlock *&Then,
+                  MachineBasicBlock *&Tail) {
+  if (Head.succ_size() != 2)
+    return false;
+
+  Then = Head.succ_begin()[0];
+  Tail = Head.succ_begin()[1];
+
+  // Canonicalize so Succ0 has MBB as its single predecessor.
+  if (Then->pred_size() != 1)
+    std::swap(Then, Tail);
+
+  if (Then->pred_size() != 1 || Then->succ_size() != 1)
+    return false;
+
+  return *Then->succ_begin() == Tail;
+}
+
+bool hasPromotableCmpConditon(MachineInstr &Term, MachineInstr *&Cmp) {
+  auto CmpIt = std::next(Term.getReverseIterator());
+  if (CmpIt == Term.getParent()->instr_rend())
+    return false;
+
+  if (!isSCmpPromotableToVCmp(*CmpIt))
+    return false;
+
+  Cmp = &*CmpIt;
+  return true;
+}
+
+bool hasCbranchSCCTerm(MachineBasicBlock &Head, MachineInstr *&Term) {
+  auto TermIt = Head.getFirstInstrTerminator();
+  if (TermIt == Head.end())
+    return false;
+
+  switch (TermIt->getOpcode()) {
+  case AMDGPU::S_CBRANCH_SCC0:
+  case AMDGPU::S_CBRANCH_SCC1:
+    Term = &*TermIt;
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool isTriangularSCCBranch(MachineBasicBlock &Head, MachineInstr *&Term,
+                           MachineInstr *&Cmp, MachineBasicBlock *&Then,
+                           MachineBasicBlock *&Tail) {
+
+  if (!hasCbranchSCCTerm(Head, Term))
+    return false;
+
+  if (!isTriangular(Head, Then, Tail))
+    return false;
+
+  // phi-nodes in the tail can prevent splicing the instructions of the then
+  // and tail blocks in the head
+  if (!Tail->empty() && Tail->begin()->isPHI())
+    return false;
+
+  if (!hasPromotableCmpConditon(*Term, Cmp))
+    return false;
+
+  return true;
+}
+
+bool SCC1JumpsToThen(const MachineInstr &Term, const MachineBasicBlock &Then) {
+  MachineBasicBlock *TBB = Term.getOperand(0).getMBB();
+  return (TBB == &Then) == (Term.getOpcode() == AMDGPU::S_CBRANCH_SCC1);
+}
+
+class AMDGPUDemoteSCCBranchToExecz {
+  MachineFunction &MF;
+  const GCNSubtarget &ST;
+  const SIInstrInfo &TII;
+  const SIRegisterInfo &RegInfo;
+  const TargetSchedModel &SchedModel;
+
+public:
+  AMDGPUDemoteSCCBranchToExecz(MachineFunction &MF)
+      : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+        RegInfo(*ST.getRegisterInfo()), SchedModel(TII.getSchedModel()) {}
+
+  bool mustRetainSCCBranch(const MachineInstr &Term, const MachineInstr &Cmp,
+                           const MachineBasicBlock &Then,
+                           const MachineBasicBlock &Tail) {
+    bool IsWave32 = TII.isWave32();
+    unsigned AndSaveExecOpc =
+        IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+    unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned NewOps[] = {*getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then)),
+                         AndSaveExecOpc, Mov};
+    unsigned NewOpsCost = 0;
+    for (unsigned Opc : NewOps)
+      NewOpsCost += SchedModel.computeInstrLatency(Opc);
+    unsigned OldCmpCost = SchedModel.computeInstrLatency(&Cmp, false);
+
+    assert(NewOpsCost >= OldCmpCost);
+    return !TII.mustRetainExeczBranch(*Term.getParent(), Then, Tail,
+                                      NewOpsCost - OldCmpCost);
+  }
+
+  void demoteCmp(MachineInstr &Term, MachineInstr &Cmp, MachineBasicBlock &Head,
+                 MachineBasicBlock &Then, MachineBasicBlock &Tail) {
+    unsigned NewCmpOpc = *getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then));
+    Cmp.setDesc(TII.get(NewCmpOpc));
+
+    MachineOperand L = Cmp.getOperand(0);
+    MachineOperand R = Cmp.getOperand(1);
+    for (unsigned i = 3; i != 0; --i)
+      Cmp.removeOperand(i - 1);
+
+    auto VCC = RegInfo.getVCC();
+    auto Exec = RegInfo.getExec();
+
+    auto &MRI = MF.getRegInfo();
+    MCRegister ExecBackup =
+        MRI.createVirtualRegister(RegInfo.getPhysRegBaseClass(Exec));
+
+    Cmp.addOperand(MachineOperand::CreateReg(VCC, true));
+    Cmp.addOperand(L);
+    Cmp.addOperand(R);
+    Cmp.addImplicitDefUseOperands(MF);
+
+    TII.legalizeOperands(Cmp);
+
+    bool IsWave32 = TII.isWave32();
+    unsigned AndSaveExecOpc =
+        IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+    auto SaveAndMaskExec = BuildMI(*Term.getParent(), Term, Cmp.getDebugLoc(),
+                                   TII.get(AndSaveExecOpc), ExecBackup);
+    SaveAndMaskExec.addReg(VCC, RegState::Kill);
+    SaveAndMaskExec->getOperand(3).setIsDead(); // mark SCC as dead
+
+    DebugLoc DL = Term.getDebugLoc();
+    TII.removeBranch(Head);
+    MachineOperand Cond[] = {
+        MachineOperand::CreateImm(SIInstrInfo::BranchPredicate::EXECZ),
+        MachineOperand::CreateReg(RegInfo.getExec(), false)};
+    TII.insertBranch(Head, &Tail, &Then, Cond, DL);
+
+    TII.restoreExec(MF, Tail, Tail.instr_begin(), DebugLoc(), ExecBackup);
+  }
+
+  bool run() {
+    if (!SchedModel.hasInstrSchedModel())
+      return false;
+    bool Changed = false;
+
+    for (MachineBasicBlock &Head : MF) {
+      MachineInstr *Term;
+      MachineInstr *Cmp;
+      MachineBasicBlock *Then;
+      MachineBasicBlock *Tail;
+      if (!isTriangularSCCBranch(Head, Term, Cmp, Then, Tail))
+        continue;
+
+      if (!mustRetainSCCBranch(*Term, *Cmp, *Then, *Tail))
+        continue;
+
+      demoteCmp(*Term, *Cmp, Head, *Then, *Tail);
+      Changed = true;
+    }
+    return Changed;
+  }
+};
+
+class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
+    return IfCvt.run();
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return PassName; }
+};
+
+char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0;
+
+} // namespace
+
+PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run(
+    MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
+  AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
+  if (!IfCvt.run())
+    return PreservedAnalyses::all();
+  return PreservedAnalyses::none();
+}
+
+char &llvm::AMDGPUDemoteSCCBranchToExeczLegacyID =
+    AMDGPUDemoteSCCBranchToExeczLegacy::ID;
+INITIALIZE_PASS_BEGIN(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName,
+                      false, false)
+INITIALIZE_PASS_END(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName,
+                    false, false)
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 0ebf34c901c142..d968ac61eea39d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -95,6 +95,7 @@ FUNCTION_PASS_WITH_PARAMS(
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
 MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
+MACHINE_FUNCTION_PASS("amdgpu-demote-scc-to-execz", AMDGPUDemoteSCCBranchToExeczPass())
 MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass())
 MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
 MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index abd50748f2cc05..2c9cc95bf8b0bb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -444,6 +444,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowLegacyPass(*PR);
+  initializeAMDGPUDemoteSCCBranchToExeczLegacyPass(*PR);
   initializeAMDGPUInsertDelayAluPass(*PR);
   initializeSIInsertHardClausesPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
@@ -1283,7 +1284,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
 bool GCNPassConfig::addILPOpts() {
   if (EnableEarlyIfConversion)
     addPass(&EarlyIfConverterID);
-
+  addPass(&AMDGPUDemoteSCCBranchToExeczLegacyID);
   TargetPassConfig::addILPOpts();
   return false;
 }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 7c883cc2017ddd..180fb7ac0c5153 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
   AMDGPUHSAMetadataStreamer.cpp
+  AMDGPUDemoteSCCBranchToExecz.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
   AMDGPUInstrInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 44ee5c56a237b4..24fe598607688e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4115,6 +4115,96 @@ bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
   return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
 }
 
+namespace {
+class BranchWeightCostModel {
+  const SIInstrInfo &TII;
+  const TargetSchedModel &SchedModel;
+  BranchProbability BranchProb;
+  uint64_t BranchCost;
+  uint64_t ThenCyclesCost;
+
+public:
+  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
+                        const MachineBasicBlock &Succ,
+                        unsigned ExtraTransformationCosts = 0)
+      : TII(TII), SchedModel(TII.getSchedModel()),
+        ThenCyclesCost(ExtraTransformationCosts) {
+    assert(SchedModel.hasInstrSchedModelOrItineraries());
+
+    const MachineBasicBlock &Head = *Branch.getParent();
+    const auto *FromIt = find(Head.successors(), &Succ);
+    assert(FromIt != Head.succ_end());
+
+    BranchProb = Head.getSuccProbability(FromIt);
+    if (BranchProb.isUnknown())
+      return;
+
+    BranchCost = SchedModel.computeInstrLatency(&Branch, false);
+  }
+
+  bool isUnknown() const { return BranchProb.isUnknown(); }
+
+  bool isProfitable(const MachineInstr &MI) {
+    assert(!isUnknown());
+
+    if (TII.isWaitcnt(MI.getOpcode()))
+      return false;
+
+    ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);
+
+    // Consider `P = N/D` to be the probability of execnz being true
+    // The transformation is profitable if always executing the 'then' block
+    // is cheaper than executing sometimes 'then' and always
+    // executing s_cbranch_execnz:
+    // * ThenCost + Extra <= P*ThenCost + BranchCost
+    // * (1-P) * (ThenCost + Extra) <= BranchCost
+    // * (D-N)/D * (ThenCost + Extra) <= BranchCost
+    uint64_t Numerator = BranchProb.getNumerator();
+    uint64_t Denominator = BranchProb.getDenominator();
+    return (Denominator - Numerator) * ThenCyclesCost <=
+           Denominator * BranchCost;
+  }
+};
+} // namespace
+
+bool SIInstrInfo::mustRetainExeczBranch(
+    const MachineBasicBlock &Head, const MachineBasicBlock &From,
+    const MachineBasicBlock &To, unsigned ExtraTransformationCosts) const {
+
+  const auto *FromIt = find(Head.successors(), &From);
+  assert(FromIt != Head.succ_end());
+
+  BranchWeightCostModel CostModel{*this, *Head.getFirstTerminator(), From,
+                                  ExtraTransformationCosts};
+  if (CostModel.isUnknown())
+    return true;
+
+  const MachineFunction *MF = From.getParent();
+  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+
+    for (const MachineInstr &MI : MBB) {
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might never be taken when EXEC = 0.
+      // Hence we should retain cbranch out of the loop lest it become infinite.
+      if (MI.isConditionalBranch())
+        return true;
+
+      if (MI.isMetaInstruction())
+        continue;
+
+      if (hasUnwantedEffectsWhenEXECEmpty(MI))
+        return true;
+
+      if (!CostModel.isProfitable(MI))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
   unsigned Opcode = MI.getOpcode();
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index f7554906a9c98b..78d75389f5a866 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -87,6 +87,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   TargetSchedModel SchedModel;
   mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
 
+public:
   // The inverse predicate should have the negative value.
   enum BranchPredicate {
     INVALID_BR = 0,
@@ -98,6 +99,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     EXECZ = 3
   };
 
+private:
   using SetVectorType = SmallSetVector<MachineInstr *, 32>;
 
   static unsigned getBranchOpcode(BranchPredicate Cond);
@@ -1031,13 +1033,21 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   /// Return true if the instruction modifies the mode register.q
   static bool modifiesModeRegister(const MachineInstr &MI);
 
+  /// Returns true if it's protifable to remove an execz branch from Head to
+  /// From
+  bool mustRetainExeczBranch(const MachineBasicBlock &Head,
+                             const MachineBasicBlock &From,
+                             const MachineBasicBlock &To,
+                             unsigned ExtraTransformationCosts = 0) const;
+
   /// This function is used to determine if an instruction can be safely
   /// executed under EXEC = 0 without hardware error, indeterminate results,
   /// and/or visible effects on future vector execution or outside the shader.
-  /// Note: as of 2024 the only use of this is SIPreEmitPeephole where it is
-  /// used in removing branches over short EXEC = 0 sequences.
-  /// As such it embeds certain assumptions which may not apply to every case
-  /// of EXEC = 0 execution.
+  /// Note: as of 2024 the only use of this is SIPreEmitPeephole and
+  /// AMDGPUDemoteSCCBranchToExecz (through SIIInstrInfo::mustRetainExeczBranch)
+  /// where it is used in removing branches over short EXEC = 0 sequences. As
+  /// such it embeds certain assumptions which may not apply to every case of
+  /// EXEC = 0 execution.
   bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
 
   /// Returns true if the instruction could potentially depend on the value of
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 1334029544f999..1c71c245ed79d0 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -15,19 +15,12 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "si-pre-emit-peephole"
 
-static unsigned SkipThreshold;
-
-static cl::opt<unsigned, true> SkipThresholdFlag(
-    "amdgpu-skip-threshold", cl::Hidden,
-    cl::desc(
-        "Number of instructions before jumping over divergent control flow"),
-    cl::location(SkipThreshold), cl::init(12));
-
 namespace {
 
 class SIPreEmitPeephole : public MachineFunctionPass {
@@ -41,8 +34,6 @@ class SIPreEmitPeephole : public MachineFunctionPass {
                             MachineBasicBlock *&TrueMBB,
                             MachineBasicBlock *&FalseMBB,
                             SmallVectorImpl<MachineOperand> &Cond);
-  bool mustRetainExeczBranch(const MachineBasicBlock &From,
-                             const MachineBasicBlock &To) const;
   bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
 
 public:
@@ -304,45 +295,13 @@ bool SIPreEmitPeephole::getBlockDestinations(
   return true;
 }
 
-bool SIPreEmitPeephole::mustRetainExeczBranch(
-    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
-  unsigned NumInstr = 0;
-  const MachineFunction *MF = From.getParent();
-
-  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
-       MBBI != End && MBBI != ToI; ++MBBI) {
-    const MachineBasicBlock &MBB = *MBBI;
-
-    for (const MachineInstr &MI : MBB) {
-      // When a uniform loop is inside non-uniform c...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/110284