[llvm] [AMDGPU][AMDGPUDemoteSCCBranchToExecz] demote s_cbranch_scc0/1 branches into vcmp + s_cbranch_execz branches (PR #110284)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 27 08:29:23 PDT 2024
Juan Manuel Martinez Caamaño <juamarti at amd.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/110284 at github.com>
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Juan Manuel Martinez Caamaño (jmmartinez)
Follow-up to https://github.com/llvm/llvm-project/pull/109818; only the last two commits matter.
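
To make the intent concrete for readers outside the backend (this sketch is illustrative, not part of the patch): the pass turns a uniform `s_cmp_*` + `s_cbranch_scc0/1` over a cheap 'then' block into a per-lane `v_cmp_*` whose result narrows EXEC through `s_and_saveexec`, at which point the branch over the block may be dropped, since vector work under an all-zero EXEC mask has no visible effect. A minimal C++ model of that mask dance:

```cpp
// Illustrative model of the rewrite, not LLVM code. The wave size, mask
// values and lane pattern below are hypothetical.
#include <bitset>
#include <iostream>

constexpr unsigned WaveSize = 32;
using Mask = std::bitset<WaveSize>;

int main() {
  Mask Exec{0xFFFFFFFFu}; // all lanes active on entry
  Mask VCC{0x0000FF00u};  // hypothetical per-lane v_cmp_* result

  Mask ExecBackup = Exec; // s_and_saveexec: save EXEC...
  Exec &= VCC;            // ...then AND it with VCC

  if (Exec.none()) {
    // This test is what s_cbranch_execz performs; after the demotion it
    // is only kept when the cost model says skipping pays off.
  } else {
    std::cout << "'then' block runs on " << Exec.count() << " lanes\n";
  }
  Exec = ExecBackup; // restoreExec at the start of the tail block
}
```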
---
Patch is 73.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110284.diff
28 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+11)
- (added) llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp (+306)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+2-1)
- (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+90)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+14-4)
- (modified) llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (+10-48)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll (+43-25)
- (modified) llvm/test/CodeGen/AMDGPU/branch-condition-and.ll (+31-15)
- (modified) llvm/test/CodeGen/AMDGPU/else.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/fptoi.i128.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll (+2-1)
- (removed) llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir (-95)
- (modified) llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir (+26-4)
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll (+6-12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll (+6-12)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+21-44)
- (modified) llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/ret_jump.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll (+9-4)
- (modified) llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll (+1-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4abb5a63ab6d2c..0eb415e66fe170 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -369,6 +369,17 @@ extern char &AMDGPUCodeGenPrepareID;
void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &);
extern char &AMDGPURemoveIncompatibleFunctionsID;
+void initializeAMDGPUDemoteSCCBranchToExeczLegacyPass(PassRegistry &);
+extern char &AMDGPUDemoteSCCBranchToExeczLegacyID;
+
+class AMDGPUDemoteSCCBranchToExeczPass
+ : public PassInfoMixin<AMDGPUDemoteSCCBranchToExeczPass> {
+public:
+ AMDGPUDemoteSCCBranchToExeczPass() = default;
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
void initializeAMDGPULateCodeGenPrepareLegacyPass(PassRegistry &);
extern char &AMDGPULateCodeGenPrepareLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
new file mode 100644
index 00000000000000..f5987cc629ce8d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
@@ -0,0 +1,306 @@
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+namespace {
+#define DEBUG_TYPE "amdgpu-demote-scc-to-execz"
+const char PassName[] = "AMDGPU s_cbranch_scc to s_cbranch_execz conversion";
+
+std::optional<unsigned> getVALUOpc(const MachineInstr &MI,
+ bool Reverse = false) {
+ unsigned Opc = MI.getOpcode();
+ if (Reverse) {
+ switch (Opc) {
+ case AMDGPU::S_CMP_EQ_I32:
+ Opc = AMDGPU::S_CMP_LG_I32;
+ break;
+ case AMDGPU::S_CMP_LG_I32:
+ Opc = AMDGPU::S_CMP_EQ_I32;
+ break;
+ case AMDGPU::S_CMP_GT_I32:
+ Opc = AMDGPU::S_CMP_LE_I32;
+ break;
+ case AMDGPU::S_CMP_GE_I32:
+ Opc = AMDGPU::S_CMP_LT_I32;
+ break;
+ case AMDGPU::S_CMP_LT_I32:
+ Opc = AMDGPU::S_CMP_GE_I32;
+ break;
+ case AMDGPU::S_CMP_LE_I32:
+ Opc = AMDGPU::S_CMP_GT_I32;
+ break;
+ case AMDGPU::S_CMP_EQ_U32:
+ Opc = AMDGPU::S_CMP_LG_U32;
+ break;
+ case AMDGPU::S_CMP_LG_U32:
+ Opc = AMDGPU::S_CMP_EQ_U32;
+ break;
+ case AMDGPU::S_CMP_GT_U32:
+ Opc = AMDGPU::S_CMP_LE_U32;
+ break;
+ case AMDGPU::S_CMP_GE_U32:
+ Opc = AMDGPU::S_CMP_LT_U32;
+ break;
+ case AMDGPU::S_CMP_LT_U32:
+ Opc = AMDGPU::S_CMP_GE_U32;
+ break;
+ case AMDGPU::S_CMP_LE_U32:
+ Opc = AMDGPU::S_CMP_GT_U32;
+ break;
+ case AMDGPU::S_CMP_EQ_U64:
+ Opc = AMDGPU::S_CMP_LG_U64;
+ break;
+ case AMDGPU::S_CMP_LG_U64:
+ Opc = AMDGPU::S_CMP_EQ_U64;
+ break;
+ default:
+ return std::nullopt;
+ }
+ }
+
+ switch (Opc) {
+ case AMDGPU::S_CMP_EQ_I32:
+ return AMDGPU::V_CMP_EQ_I32_e64;
+ case AMDGPU::S_CMP_LG_I32:
+ return AMDGPU::V_CMP_NE_I32_e64;
+ case AMDGPU::S_CMP_GT_I32:
+ return AMDGPU::V_CMP_GT_I32_e64;
+ case AMDGPU::S_CMP_GE_I32:
+ return AMDGPU::V_CMP_GE_I32_e64;
+ case AMDGPU::S_CMP_LT_I32:
+ return AMDGPU::V_CMP_LT_I32_e64;
+ case AMDGPU::S_CMP_LE_I32:
+ return AMDGPU::V_CMP_LE_I32_e64;
+ case AMDGPU::S_CMP_EQ_U32:
+ return AMDGPU::V_CMP_EQ_U32_e64;
+ case AMDGPU::S_CMP_LG_U32:
+ return AMDGPU::V_CMP_NE_U32_e64;
+ case AMDGPU::S_CMP_GT_U32:
+ return AMDGPU::V_CMP_GT_U32_e64;
+ case AMDGPU::S_CMP_GE_U32:
+ return AMDGPU::V_CMP_GE_U32_e64;
+ case AMDGPU::S_CMP_LT_U32:
+ return AMDGPU::V_CMP_LT_U32_e64;
+ case AMDGPU::S_CMP_LE_U32:
+ return AMDGPU::V_CMP_LE_U32_e64;
+ case AMDGPU::S_CMP_EQ_U64:
+ return AMDGPU::V_CMP_EQ_U64_e64;
+ case AMDGPU::S_CMP_LG_U64:
+ return AMDGPU::V_CMP_NE_U64_e64;
+ default:
+ return std::nullopt;
+ }
+}
+
+bool isSCmpPromotableToVCmp(const MachineInstr &MI) {
+ return getVALUOpc(MI).has_value();
+}
+
+bool isTriangular(MachineBasicBlock &Head, MachineBasicBlock *&Then,
+ MachineBasicBlock *&Tail) {
+ if (Head.succ_size() != 2)
+ return false;
+
+ Then = Head.succ_begin()[0];
+ Tail = Head.succ_begin()[1];
+
+ // Canonicalize so that Then has Head as its single predecessor.
+ if (Then->pred_size() != 1)
+ std::swap(Then, Tail);
+
+ if (Then->pred_size() != 1 || Then->succ_size() != 1)
+ return false;
+
+ return *Then->succ_begin() == Tail;
+}
+
+bool hasPromotableCmpCondition(MachineInstr &Term, MachineInstr *&Cmp) {
+ auto CmpIt = std::next(Term.getReverseIterator());
+ if (CmpIt == Term.getParent()->instr_rend())
+ return false;
+
+ if (!isSCmpPromotableToVCmp(*CmpIt))
+ return false;
+
+ Cmp = &*CmpIt;
+ return true;
+}
+
+bool hasCbranchSCCTerm(MachineBasicBlock &Head, MachineInstr *&Term) {
+ auto TermIt = Head.getFirstInstrTerminator();
+ if (TermIt == Head.end())
+ return false;
+
+ switch (TermIt->getOpcode()) {
+ case AMDGPU::S_CBRANCH_SCC0:
+ case AMDGPU::S_CBRANCH_SCC1:
+ Term = &*TermIt;
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool isTriangularSCCBranch(MachineBasicBlock &Head, MachineInstr *&Term,
+ MachineInstr *&Cmp, MachineBasicBlock *&Then,
+ MachineBasicBlock *&Tail) {
+
+ if (!hasCbranchSCCTerm(Head, Term))
+ return false;
+
+ if (!isTriangular(Head, Then, Tail))
+ return false;
+
+ // PHI nodes in the tail can prevent splicing the instructions of the
+ // then and tail blocks into the head.
+ if (!Tail->empty() && Tail->begin()->isPHI())
+ return false;
+
+ if (!hasPromotableCmpCondition(*Term, Cmp))
+ return false;
+
+ return true;
+}
+
+bool SCC1JumpsToThen(const MachineInstr &Term, const MachineBasicBlock &Then) {
+ MachineBasicBlock *TBB = Term.getOperand(0).getMBB();
+ return (TBB == &Then) == (Term.getOpcode() == AMDGPU::S_CBRANCH_SCC1);
+}
+
+class AMDGPUDemoteSCCBranchToExecz {
+ MachineFunction &MF;
+ const GCNSubtarget &ST;
+ const SIInstrInfo &TII;
+ const SIRegisterInfo &RegInfo;
+ const TargetSchedModel &SchedModel;
+
+public:
+ AMDGPUDemoteSCCBranchToExecz(MachineFunction &MF)
+ : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+ RegInfo(*ST.getRegisterInfo()), SchedModel(TII.getSchedModel()) {}
+
+ bool isProfitableToDemote(const MachineInstr &Term, const MachineInstr &Cmp,
+ const MachineBasicBlock &Then,
+ const MachineBasicBlock &Tail) {
+ bool IsWave32 = TII.isWave32();
+ unsigned AndSaveExecOpc =
+ IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+ unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned NewOps[] = {*getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then)),
+ AndSaveExecOpc, Mov};
+ unsigned NewOpsCost = 0;
+ for (unsigned Opc : NewOps)
+ NewOpsCost += SchedModel.computeInstrLatency(Opc);
+ unsigned OldCmpCost = SchedModel.computeInstrLatency(&Cmp, false);
+
+ assert(NewOpsCost >= OldCmpCost);
+ return !TII.mustRetainExeczBranch(*Term.getParent(), Then, Tail,
+ NewOpsCost - OldCmpCost);
+ }
+
+ void demoteCmp(MachineInstr &Term, MachineInstr &Cmp, MachineBasicBlock &Head,
+ MachineBasicBlock &Then, MachineBasicBlock &Tail) {
+ unsigned NewCmpOpc = *getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then));
+ Cmp.setDesc(TII.get(NewCmpOpc));
+
+ MachineOperand L = Cmp.getOperand(0);
+ MachineOperand R = Cmp.getOperand(1);
+ for (unsigned i = 3; i != 0; --i)
+ Cmp.removeOperand(i - 1);
+
+ auto VCC = RegInfo.getVCC();
+ auto Exec = RegInfo.getExec();
+
+ auto &MRI = MF.getRegInfo();
+ MCRegister ExecBackup =
+ MRI.createVirtualRegister(RegInfo.getPhysRegBaseClass(Exec));
+
+ Cmp.addOperand(MachineOperand::CreateReg(VCC, true));
+ Cmp.addOperand(L);
+ Cmp.addOperand(R);
+ Cmp.addImplicitDefUseOperands(MF);
+
+ TII.legalizeOperands(Cmp);
+
+ bool IsWave32 = TII.isWave32();
+ unsigned AndSaveExecOpc =
+ IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+ auto SaveAndMaskExec = BuildMI(*Term.getParent(), Term, Cmp.getDebugLoc(),
+ TII.get(AndSaveExecOpc), ExecBackup);
+ SaveAndMaskExec.addReg(VCC, RegState::Kill);
+ SaveAndMaskExec->getOperand(3).setIsDead(); // mark SCC as dead
+
+ DebugLoc DL = Term.getDebugLoc();
+ TII.removeBranch(Head);
+ MachineOperand Cond[] = {
+ MachineOperand::CreateImm(SIInstrInfo::BranchPredicate::EXECZ),
+ MachineOperand::CreateReg(RegInfo.getExec(), false)};
+ TII.insertBranch(Head, &Tail, &Then, Cond, DL);
+
+ TII.restoreExec(MF, Tail, Tail.instr_begin(), DebugLoc(), ExecBackup);
+ }
+
+ bool run() {
+ if (!SchedModel.hasInstrSchedModel())
+ return false;
+ bool Changed = false;
+
+ for (MachineBasicBlock &Head : MF) {
+ MachineInstr *Term;
+ MachineInstr *Cmp;
+ MachineBasicBlock *Then;
+ MachineBasicBlock *Tail;
+ if (!isTriangularSCCBranch(Head, Term, Cmp, Then, Tail))
+ continue;
+
+ if (!isProfitableToDemote(*Term, *Cmp, *Then, *Tail))
+ continue;
+
+ demoteCmp(*Term, *Cmp, Head, *Then, *Tail);
+ Changed = true;
+ }
+ return Changed;
+ }
+};
+
+class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
+ return IfCvt.run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return PassName; }
+};
+
+char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0;
+
+} // namespace
+
+PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run(
+ MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
+ AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
+ if (!IfCvt.run())
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+char &llvm::AMDGPUDemoteSCCBranchToExeczLegacyID =
+ AMDGPUDemoteSCCBranchToExeczLegacy::ID;
+INITIALIZE_PASS_BEGIN(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName,
+ false, false)
+INITIALIZE_PASS_END(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName,
+ false, false)
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 0ebf34c901c142..d968ac61eea39d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -95,6 +95,7 @@ FUNCTION_PASS_WITH_PARAMS(
#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
#endif
MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
+MACHINE_FUNCTION_PASS("amdgpu-demote-scc-to-execz", AMDGPUDemoteSCCBranchToExeczPass())
MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass())
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index abd50748f2cc05..2c9cc95bf8b0bb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -444,6 +444,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowLegacyPass(*PR);
+ initializeAMDGPUDemoteSCCBranchToExeczLegacyPass(*PR);
initializeAMDGPUInsertDelayAluPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
@@ -1283,7 +1284,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
bool GCNPassConfig::addILPOpts() {
if (EnableEarlyIfConversion)
addPass(&EarlyIfConverterID);
-
+ addPass(&AMDGPUDemoteSCCBranchToExeczLegacyID);
TargetPassConfig::addILPOpts();
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 7c883cc2017ddd..180fb7ac0c5153 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUGlobalISelDivergenceLowering.cpp
AMDGPUGlobalISelUtils.cpp
AMDGPUHSAMetadataStreamer.cpp
+ AMDGPUDemoteSCCBranchToExecz.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
AMDGPUInstrInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 44ee5c56a237b4..24fe598607688e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4115,6 +4115,96 @@ bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
}
+namespace {
+class BranchWeightCostModel {
+ const SIInstrInfo &TII;
+ const TargetSchedModel &SchedModel;
+ BranchProbability BranchProb;
+ uint64_t BranchCost;
+ uint64_t ThenCyclesCost;
+
+public:
+ BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
+ const MachineBasicBlock &Succ,
+ unsigned ExtraTransformationCosts = 0)
+ : TII(TII), SchedModel(TII.getSchedModel()),
+ ThenCyclesCost(ExtraTransformationCosts) {
+ assert(SchedModel.hasInstrSchedModelOrItineraries());
+
+ const MachineBasicBlock &Head = *Branch.getParent();
+ const auto *FromIt = find(Head.successors(), &Succ);
+ assert(FromIt != Head.succ_end());
+
+ BranchProb = Head.getSuccProbability(FromIt);
+ if (BranchProb.isUnknown())
+ return;
+
+ BranchCost = SchedModel.computeInstrLatency(&Branch, false);
+ }
+
+ bool isUnknown() const { return BranchProb.isUnknown(); }
+
+ bool isProfitable(const MachineInstr &MI) {
+ assert(!isUnknown());
+
+ if (TII.isWaitcnt(MI.getOpcode()))
+ return false;
+
+ ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);
+
+ // Consider `P = N/D` to be the probability that execnz is true, i.e.
+ // that the 'then' block gets executed.
+ // The transformation is profitable if always executing the 'then' block
+ // (plus the extra transformation cost) is cheaper in expectation than
+ // sometimes executing 'then' and always executing s_cbranch_execnz:
+ // * (1-P) * (ThenCost + Extra) <= BranchCost
+ // * (D-N)/D * (ThenCost + Extra) <= BranchCost
+ uint64_t Numerator = BranchProb.getNumerator();
+ uint64_t Denominator = BranchProb.getDenominator();
+ return (Denominator - Numerator) * ThenCyclesCost <=
+ Denominator * BranchCost;
+ }
+};
+} // namespace
+
+bool SIInstrInfo::mustRetainExeczBranch(
+ const MachineBasicBlock &Head, const MachineBasicBlock &From,
+ const MachineBasicBlock &To, unsigned ExtraTransformationCosts) const {
+
+ BranchWeightCostModel CostModel{*this, *Head.getFirstTerminator(), From,
+ ExtraTransformationCosts};
+ if (CostModel.isUnknown())
+ return true;
+
+ const MachineFunction *MF = From.getParent();
+ for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+ MBBI != End && MBBI != ToI; ++MBBI) {
+ const MachineBasicBlock &MBB = *MBBI;
+
+ for (const MachineInstr &MI : MBB) {
+ // When a uniform loop is inside non-uniform control flow, the branch
+ // leaving the loop might never be taken when EXEC = 0.
+ // Hence we should retain cbranch out of the loop lest it become infinite.
+ if (MI.isConditionalBranch())
+ return true;
+
+ if (MI.isMetaInstruction())
+ continue;
+
+ if (hasUnwantedEffectsWhenEXECEmpty(MI))
+ return true;
+
+ if (!CostModel.isProfitable(MI))
+ return true;
+ }
+ }
+
+ return false;
+}
+
bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index f7554906a9c98b..78d75389f5a866 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -87,6 +87,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
TargetSchedModel SchedModel;
mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
+public:
// The inverse predicate should have the negative value.
enum BranchPredicate {
INVALID_BR = 0,
@@ -98,6 +99,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
EXECZ = 3
};
+private:
using SetVectorType = SmallSetVector<MachineInstr *, 32>;
static unsigned getBranchOpcode(BranchPredicate Cond);
@@ -1031,13 +1033,21 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// Return true if the instruction modifies the mode register.
static bool modifiesModeRegister(const MachineInstr &MI);
+ /// Returns true if the execz branch from \p Head over \p From to \p To
+ /// must be retained, i.e. if it is not profitable to remove it.
+ bool mustRetainExeczBranch(const MachineBasicBlock &Head,
+ const MachineBasicBlock &From,
+ const MachineBasicBlock &To,
+ unsigned ExtraTransformationCosts = 0) const;
+
/// This function is used to determine if an instruction can be safely
/// executed under EXEC = 0 without hardware error, indeterminate results,
/// and/or visible effects on future vector execution or outside the shader.
- /// Note: as of 2024 the only use of this is SIPreEmitPeephole where it is
- /// used in removing branches over short EXEC = 0 sequences.
- /// As such it embeds certain assumptions which may not apply to every case
- /// of EXEC = 0 execution.
+ /// Note: as of 2024 the only users of this are SIPreEmitPeephole and
+ /// AMDGPUDemoteSCCBranchToExecz (through SIInstrInfo::mustRetainExeczBranch),
+ /// where it is used in removing branches over short EXEC = 0 sequences. As
+ /// such it embeds certain assumptions which may not apply to every case of
+ /// EXEC = 0 execution.
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
/// Returns true if the instruction could potentially depend on the value of
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 1334029544f999..1c71c245ed79d0 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -15,19 +15,12 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetSchedule.h"
using namespace llvm;
#define DEBUG_TYPE "si-pre-emit-peephole"
-static unsigned SkipThreshold;
-
-static cl::opt<unsigned, true> SkipThresholdFlag(
- "amdgpu-skip-threshold", cl::Hidden,
- cl::desc(
- "Number of instructions before jumping over divergent control flow"),
- cl::location(SkipThreshold), cl::init(12));
-
namespace {
class SIPreEmitPeephole : public MachineFunctionPass {
@@ -41,8 +34,6 @@ class SIPreEmitPeephole : public MachineFunctionPass {
MachineBasicBlock *&TrueMBB,
MachineBasicBlock *&FalseMBB,
SmallVectorImpl<MachineOperand> &Cond);
- bool mustRetainExeczBranch(const MachineBasicBlock &From,
- const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
public:
@@ -304,45 +295,13 @@ bool SIPreEmitPeephole::getBlockDestinations(
return true;
}
-bool SIPreEmitPeephole::mustRetainExeczBranch(
- const MachineBasicBlock &From, const MachineBasicBlock &To) const {
- unsigned NumInstr = 0;
- const MachineFunction *MF = From.getParent();
-
- for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
- MBBI != End && MBBI != ToI; ++MBBI) {
- const MachineBasicBlock &MBB = *MBBI;
-
- for (const MachineInstr &MI : MBB) {
- // When a uniform loop is inside non-uniform c...
[truncated]
``````````
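
Since the diff is truncated above, a self-contained sketch of the inequality that `BranchWeightCostModel::isProfitable` evaluates may help. The latency numbers here are made up; the real pass reads them from the target's `TargetSchedModel`:

```cpp
// Sketch of the profitability check, with hypothetical cycle counts.
#include <cassert>
#include <cstdint>
#include <iostream>

// Removing the execz branch is profitable iff
//   (1 - P) * (ThenCycles + ExtraCycles) <= BranchCycles, where P = N/D
// is the probability that the 'then' block executes. In integers:
bool profitableToDemote(uint64_t N, uint64_t D, uint64_t ThenCycles,
                        uint64_t ExtraCycles, uint64_t BranchCycles) {
  assert(D != 0 && N <= D);
  return (D - N) * (ThenCycles + ExtraCycles) <= D * BranchCycles;
}

int main() {
  // Hot 'then' block, taken 7 times out of 8: (1/8)*15 <= 4, so demote.
  std::cout << profitableToDemote(7, 8, 12, 3, 4) << '\n'; // prints 1
  // Cold 'then' block, taken once out of 8: (7/8)*15 > 4, keep the branch.
  std::cout << profitableToDemote(1, 8, 12, 3, 4) << '\n'; // prints 0
}
```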
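One subtlety from the `getVALUOpc` helper is worth spelling out: when `s_cbranch_scc1` targets the tail rather than the 'then' block, 'then' runs on the false path, so the scalar compare must be reversed before it is mapped to a VALU opcode; otherwise VCC would select the wrong lanes. A sketch with stand-in opcode strings (the real code switches over the `AMDGPU::` opcode enums):

```cpp
// Sketch of the Reverse handling in getVALUOpc; opcode strings are
// stand-ins for the real AMDGPU:: enum values.
#include <iostream>
#include <optional>
#include <string>

std::optional<std::string> getVCmpFor(std::string SCmp, bool Reverse) {
  if (Reverse) { // negate the predicate first: lt <-> ge, eq <-> lg, ...
    if (SCmp == "s_cmp_lt_u32")
      SCmp = "s_cmp_ge_u32";
    else if (SCmp == "s_cmp_ge_u32")
      SCmp = "s_cmp_lt_u32";
    else
      return std::nullopt; // not promotable
  }
  if (SCmp == "s_cmp_lt_u32")
    return "v_cmp_lt_u32_e64";
  if (SCmp == "s_cmp_ge_u32")
    return "v_cmp_ge_u32_e64";
  return std::nullopt;
}

int main() {
  // SCC1 jumps to 'then': keep the predicate as-is.
  std::cout << *getVCmpFor("s_cmp_lt_u32", false) << '\n'; // v_cmp_lt_u32_e64
  // SCC1 jumps to the tail: 'then' runs when the compare is false.
  std::cout << *getVCmpFor("s_cmp_lt_u32", true) << '\n';  // v_cmp_ge_u32_e64
}
```

Assuming the registered name stays `amdgpu-demote-scc-to-execz`, something like `llc -run-pass=amdgpu-demote-scc-to-execz` on a `.mir` reproducer should exercise the pass in isolation.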
https://github.com/llvm/llvm-project/pull/110284
More information about the llvm-commits
mailing list