[llvm] [AMDGPU][AMDGPUDemoteSCCBranchToExecz] demote s_cbranch_scc0/1 branches into vcmp + s_cbranch_execz branches (PR #110284)
Juan Manuel Martinez Caamaño via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 14 07:56:23 PDT 2024
https://github.com/jmmartinez updated https://github.com/llvm/llvm-project/pull/110284
>From cc8ceabcce69b34af532e27eabea40d78b80ab36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Fri, 20 Sep 2024 15:31:49 +0200
Subject: [PATCH 1/2] [AMDGPU][AMDGPUDemoteSCCBranchToExecz] create new pass
(boilerplate only)
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 3 ++
.../AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp | 54 +++++++++++++++++++
.../AMDGPU/AMDGPUDemoteSCCBranchToExecz.h | 31 +++++++++++
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 ++
7 files changed, 97 insertions(+), 1 deletion(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.h
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 342d55e828bca5..e7515c16e44b5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -373,6 +373,9 @@ extern char &AMDGPUCodeGenPrepareID;
void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &);
extern char &AMDGPURemoveIncompatibleFunctionsID;
+void initializeAMDGPUDemoteSCCBranchToExeczLegacyPass(PassRegistry &);
+extern char &AMDGPUDemoteSCCBranchToExeczLegacyID;
+
void initializeAMDGPULateCodeGenPrepareLegacyPass(PassRegistry &);
extern char &AMDGPULateCodeGenPrepareLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
new file mode 100644
index 00000000000000..112de9f7943422
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
@@ -0,0 +1,54 @@
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+#include "AMDGPU.h"
+#include "AMDGPUDemoteSCCBranchToExecz.h"
+
+using namespace llvm;
+
+namespace {
+#define DEBUG_TYPE "amdgpu-demote-scc-to-execz"
+const char PassName[] = "AMDGPU if conversion";
+
+class AMDGPUDemoteSCCBranchToExecz {
+public:
+ AMDGPUDemoteSCCBranchToExecz() = default;
+
+ bool run() { return false; }
+};
+
+class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ AMDGPUDemoteSCCBranchToExecz IfCvt{};
+ return IfCvt.run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return PassName; }
+};
+
+char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0;
+
+} // namespace
+
+PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run(
+ MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
+ AMDGPUDemoteSCCBranchToExecz IfCvt{};
+ if (!IfCvt.run())
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+char &llvm::AMDGPUDemoteSCCBranchToExeczLegacyID =
+ AMDGPUDemoteSCCBranchToExeczLegacy::ID;
+INITIALIZE_PASS_BEGIN(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName,
+ false, false)
+INITIALIZE_PASS_END(AMDGPUDemoteSCCBranchToExeczLegacy, DEBUG_TYPE, PassName,
+ false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.h b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.h
new file mode 100644
index 00000000000000..3db3b639dd55fe
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.h
@@ -0,0 +1,31 @@
+//===- AMDGPUDemoteSCCBranchToExecz.h ---- demote s_cbranch_scc -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Pass used to demote s_cbranch_scc0/1 branches to s_cbranch_execz
+/// branches. These can be later removed by SIPreEmitPeephole.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDEMOTESCCBRANCHTOEXECZ_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDEMOTESCCBRANCHTOEXECZ_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class AMDGPUDemoteSCCBranchToExeczPass
+ : public PassInfoMixin<AMDGPUDemoteSCCBranchToExeczPass> {
+public:
+ AMDGPUDemoteSCCBranchToExeczPass() = default;
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 0ebf34c901c142..d968ac61eea39d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -95,6 +95,7 @@ FUNCTION_PASS_WITH_PARAMS(
#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
#endif
MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
+MACHINE_FUNCTION_PASS("amdgpu-demote-scc-to-execz", AMDGPUDemoteSCCBranchToExeczPass())
MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass())
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 16e23879cd735c..62caf8db2c81b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -18,6 +18,7 @@
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCtorDtorLowering.h"
+#include "AMDGPUDemoteSCCBranchToExecz.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
@@ -498,6 +499,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowLegacyPass(*PR);
+ initializeAMDGPUDemoteSCCBranchToExeczLegacyPass(*PR);
initializeAMDGPUInsertDelayAluPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
@@ -1336,7 +1338,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
bool GCNPassConfig::addILPOpts() {
if (EnableEarlyIfConversion)
addPass(&EarlyIfConverterID);
-
+ addPass(&AMDGPUDemoteSCCBranchToExeczLegacyID);
TargetPassConfig::addILPOpts();
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index fed29c3e14aae2..52bb7db3f8ef9f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUGlobalISelDivergenceLowering.cpp
AMDGPUGlobalISelUtils.cpp
AMDGPUHSAMetadataStreamer.cpp
+ AMDGPUDemoteSCCBranchToExecz.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
AMDGPUInstrInfo.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 646b1264f5deaa..28eb45bbc96c69 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -315,6 +315,7 @@
; GCN-O1-NEXT: Merge disjoint stack slots
; GCN-O1-NEXT: Local Stack Slot Allocation
; GCN-O1-NEXT: Remove dead machine instructions
+; GCN-O1-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion
; GCN-O1-NEXT: MachineDominator Tree Construction
; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Machine Block Frequency Analysis
@@ -617,6 +618,7 @@
; GCN-O1-OPTS-NEXT: Merge disjoint stack slots
; GCN-O1-OPTS-NEXT: Local Stack Slot Allocation
; GCN-O1-OPTS-NEXT: Remove dead machine instructions
+; GCN-O1-OPTS-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion
; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis
@@ -932,6 +934,7 @@
; GCN-O2-NEXT: Merge disjoint stack slots
; GCN-O2-NEXT: Local Stack Slot Allocation
; GCN-O2-NEXT: Remove dead machine instructions
+; GCN-O2-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion
; GCN-O2-NEXT: MachineDominator Tree Construction
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Machine Block Frequency Analysis
@@ -1260,6 +1263,7 @@
; GCN-O3-NEXT: Merge disjoint stack slots
; GCN-O3-NEXT: Local Stack Slot Allocation
; GCN-O3-NEXT: Remove dead machine instructions
+; GCN-O3-NEXT: AMDGPU s_cbranch_scc to s_cbranch_execz conversion
; GCN-O3-NEXT: MachineDominator Tree Construction
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Machine Block Frequency Analysis
>From 4626fb4330ad0871bc0335d637dba02422e83f34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Fri, 20 Sep 2024 15:37:38 +0200
Subject: [PATCH 2/2] [AMDGPU][AMDGPUDemoteSCCBranchToExecz] Implementation:
demote s_cbranch_scc branches into vcmp + s_cbranch_execz branches
---
.../AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp | 207 +++++++++++++++++-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 82 +++++++
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 18 +-
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 85 +------
.../AMDGPU/amdgpu-demote-scc-branches.ll | 59 +++--
5 files changed, 339 insertions(+), 112 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
index 112de9f7943422..8131ed666f191f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp
@@ -2,18 +2,215 @@
#include "AMDGPU.h"
#include "AMDGPUDemoteSCCBranchToExecz.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
using namespace llvm;
namespace {
#define DEBUG_TYPE "amdgpu-demote-scc-to-execz"
-const char PassName[] = "AMDGPU if conversion";
+const char PassName[] = "AMDGPU s_cbranch_scc to s_cbranch_execz conversion";
+
+std::optional<unsigned> getVALUOpc(const MachineInstr &MI,
+ bool Reverse = false) {
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+#define HandleOpcAndReverse(Opc, ReverseOpc, VOpc, ReverseVOpc) \
+ case Opc: \
+ return Reverse ? ReverseVOpc : VOpc; \
+ case ReverseOpc: \
+ return Reverse ? VOpc : ReverseVOpc
+ HandleOpcAndReverse(AMDGPU::S_CMP_EQ_I32, AMDGPU::S_CMP_LG_I32,
+ AMDGPU::V_CMP_EQ_I32_e64, AMDGPU::V_CMP_NE_I32_e64);
+ HandleOpcAndReverse(AMDGPU::S_CMP_EQ_U32, AMDGPU::S_CMP_LG_U32,
+ AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_NE_U32_e64);
+ HandleOpcAndReverse(AMDGPU::S_CMP_GT_I32, AMDGPU::S_CMP_LE_I32,
+ AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_LE_I32_e64);
+ HandleOpcAndReverse(AMDGPU::S_CMP_GT_U32, AMDGPU::S_CMP_LE_U32,
+ AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_LE_U32_e64);
+ HandleOpcAndReverse(AMDGPU::S_CMP_GE_I32, AMDGPU::S_CMP_LT_I32,
+ AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_LT_I32_e64);
+ HandleOpcAndReverse(AMDGPU::S_CMP_GE_U32, AMDGPU::S_CMP_LT_U32,
+ AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_LT_U32_e64);
+ HandleOpcAndReverse(AMDGPU::S_CMP_EQ_U64, AMDGPU::S_CMP_LG_U64,
+ AMDGPU::V_CMP_EQ_U64_e64, AMDGPU::V_CMP_NE_U64_e64);
+#undef HandleOpcAndReverse
+ default:
+ break;
+ }
+ return std::nullopt;
+}
+
+bool isSCmpPromotableToVCmp(const MachineInstr &MI) {
+ return getVALUOpc(MI).has_value();
+}
+
+bool isTriangular(MachineBasicBlock &Head, MachineBasicBlock *&Then,
+ MachineBasicBlock *&Tail) {
+ if (Head.succ_size() != 2)
+ return false;
+
+ Then = Head.succ_begin()[0];
+ Tail = Head.succ_begin()[1];
+
+ // Canonicalize so Then has Head as its single predecessor.
+ if (Then->pred_size() != 1)
+ std::swap(Then, Tail);
+
+ if (Then->pred_size() != 1 || Then->succ_size() != 1)
+ return false;
+
+ return *Then->succ_begin() == Tail;
+}
+
+bool hasPromotableCmpConditon(MachineInstr &Term, MachineInstr *&Cmp) {
+ auto CmpIt = std::next(Term.getReverseIterator());
+ if (CmpIt == Term.getParent()->instr_rend())
+ return false;
+
+ if (!isSCmpPromotableToVCmp(*CmpIt))
+ return false;
+
+ Cmp = &*CmpIt;
+ return true;
+}
+
+bool hasCbranchSCCTerm(MachineBasicBlock &Head, MachineInstr *&Term) {
+ auto TermIt = Head.getFirstInstrTerminator();
+ if (TermIt == Head.end())
+ return false;
+
+ switch (TermIt->getOpcode()) {
+ case AMDGPU::S_CBRANCH_SCC0:
+ case AMDGPU::S_CBRANCH_SCC1:
+ Term = &*TermIt;
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool isTriangularSCCBranch(MachineBasicBlock &Head, MachineInstr *&Term,
+ MachineInstr *&Cmp, MachineBasicBlock *&Then,
+ MachineBasicBlock *&Tail) {
+
+ if (!hasCbranchSCCTerm(Head, Term))
+ return false;
+
+ bool SCCIsUsedOutsideHead = any_of(
+ Head.liveouts(), [](const auto &P) { return P.PhysReg == AMDGPU::SCC; });
+ if (SCCIsUsedOutsideHead)
+ return false;
+
+ if (!isTriangular(Head, Then, Tail))
+ return false;
+
+ // phi-nodes in the tail can prevent splicing the instructions of the then
+ // and tail blocks in the head
+ if (!Tail->empty() && Tail->begin()->isPHI())
+ return false;
+
+ if (!hasPromotableCmpConditon(*Term, Cmp))
+ return false;
+
+ return true;
+}
+
+bool SCC1JumpsToThen(const MachineInstr &Term, const MachineBasicBlock &Then) {
+ MachineBasicBlock *TBB = Term.getOperand(0).getMBB();
+ return (TBB == &Then) == (Term.getOpcode() == AMDGPU::S_CBRANCH_SCC1);
+}
class AMDGPUDemoteSCCBranchToExecz {
+ MachineFunction &MF;
+ const GCNSubtarget &ST;
+ const SIInstrInfo &TII;
+ const SIRegisterInfo &RegInfo;
+ const TargetSchedModel &SchedModel;
+
public:
- AMDGPUDemoteSCCBranchToExecz() = default;
+ AMDGPUDemoteSCCBranchToExecz(MachineFunction &MF)
+ : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+ RegInfo(*ST.getRegisterInfo()), SchedModel(TII.getSchedModel()) {}
+
+ bool mustRetainSCCBranch(const MachineInstr &Term, const MachineInstr &Cmp,
+ const MachineBasicBlock &Then,
+ const MachineBasicBlock &Tail) {
+ bool IsWave32 = TII.isWave32();
+ unsigned AndSaveExecOpc =
+ IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+ unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned NewOps[] = {*getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then)),
+ AndSaveExecOpc, Mov};
+ unsigned NewOpsCost = 0;
+ for (unsigned Opc : NewOps)
+ NewOpsCost += SchedModel.computeInstrLatency(Opc);
+ unsigned OldCmpCost = SchedModel.computeInstrLatency(&Cmp, false);
+
+ assert(NewOpsCost >= OldCmpCost);
+ return !TII.mustRetainExeczBranch(Term, Then, Tail,
+ NewOpsCost - OldCmpCost);
+ }
+
+ void demoteCmp(MachineInstr &Term, MachineInstr &Cmp, MachineBasicBlock &Head,
+ MachineBasicBlock &Then, MachineBasicBlock &Tail) {
+ unsigned NewCmpOpc = *getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then));
+ Cmp.setDesc(TII.get(NewCmpOpc));
+
+ Cmp.removeOperand(2);
+
+ auto VCC = RegInfo.getVCC();
+ auto Exec = RegInfo.getExec();
- bool run() { return false; }
+ auto &MRI = MF.getRegInfo();
+ MCRegister ExecBackup =
+ MRI.createVirtualRegister(RegInfo.getPhysRegBaseClass(Exec));
+
+ Cmp.insert(Cmp.operands_begin(), MachineOperand::CreateReg(VCC, true));
+ Cmp.addImplicitDefUseOperands(MF);
+
+ TII.legalizeOperands(Cmp);
+
+ bool IsWave32 = TII.isWave32();
+ unsigned AndSaveExecOpc =
+ IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+ auto SaveAndMaskExec = BuildMI(*Term.getParent(), Term, Cmp.getDebugLoc(),
+ TII.get(AndSaveExecOpc), ExecBackup);
+ SaveAndMaskExec.addReg(VCC, RegState::Kill);
+ SaveAndMaskExec->getOperand(3).setIsDead(); // mark SCC as dead
+
+ DebugLoc DL = Term.getDebugLoc();
+ TII.removeBranch(Head);
+ MachineOperand Cond[] = {
+ MachineOperand::CreateImm(SIInstrInfo::BranchPredicate::EXECZ),
+ MachineOperand::CreateReg(RegInfo.getExec(), false)};
+ TII.insertBranch(Head, &Tail, &Then, Cond, DL);
+
+ TII.restoreExec(MF, Tail, Tail.instr_begin(), DebugLoc(), ExecBackup);
+ }
+
+ bool run() {
+ if (!SchedModel.hasInstrSchedModel())
+ return false;
+ bool Changed = false;
+
+ for (MachineBasicBlock &Head : MF) {
+ MachineInstr *Term;
+ MachineInstr *Cmp;
+ MachineBasicBlock *Then;
+ MachineBasicBlock *Tail;
+ if (!isTriangularSCCBranch(Head, Term, Cmp, Then, Tail))
+ continue;
+
+ if (!mustRetainSCCBranch(*Term, *Cmp, *Then, *Tail))
+ continue;
+
+ demoteCmp(*Term, *Cmp, Head, *Then, *Tail);
+ Changed = true;
+ }
+ return Changed;
+ }
};
class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
@@ -23,7 +220,7 @@ class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
- AMDGPUDemoteSCCBranchToExecz IfCvt{};
+ AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
return IfCvt.run();
}
@@ -40,7 +237,7 @@ char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0;
PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run(
MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
- AMDGPUDemoteSCCBranchToExecz IfCvt{};
+ AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
if (!IfCvt.run())
return PreservedAnalyses::all();
return PreservedAnalyses::none();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d676d561d08180..e957d737a9814e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4118,6 +4118,88 @@ bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
}
+namespace {
+class BranchWeightCostModel {
+ const SIInstrInfo &TII;
+ const TargetSchedModel &SchedModel;
+ BranchProbability BranchProb;
+ static constexpr uint64_t BranchNotTakenCost = 1;
+ uint64_t BranchTakenCost;
+ uint64_t ThenCyclesCost;
+
+public:
+ BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
+ const MachineBasicBlock &Succ,
+ unsigned ExtraTransformationCosts)
+ : TII(TII), SchedModel(TII.getSchedModel()),
+ ThenCyclesCost(ExtraTransformationCosts) {
+ const MachineBasicBlock &Head = *Branch.getParent();
+ const auto *FromIt = find(Head.successors(), &Succ);
+ assert(FromIt != Head.succ_end());
+
+ BranchProb = Head.getSuccProbability(FromIt);
+ if (BranchProb.isUnknown())
+ BranchProb = BranchProbability::getZero();
+ BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
+ }
+
+ bool isProfitable(const MachineInstr &MI) {
+ if (TII.isWaitcnt(MI.getOpcode()))
+ return false;
+
+ ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
+
+ // Consider `P = N/D` to be the probability of execz being false (i.e. the
+ // then-block is executed). The transformation is profitable if always
+ // executing the 'then' block is cheaper than executing sometimes 'then' and
+ // always executing s_cbranch_execz:
+ // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
+ // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
+ // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
+ // BranchNotTakenCost
+ uint64_t Numerator = BranchProb.getNumerator();
+ uint64_t Denominator = BranchProb.getDenominator();
+ return (Denominator - Numerator) * ThenCyclesCost <=
+ ((Denominator - Numerator) * BranchTakenCost +
+ Numerator * BranchNotTakenCost);
+ }
+};
+} // namespace
+
+bool SIInstrInfo::mustRetainExeczBranch(
+ const MachineInstr &Branch, const MachineBasicBlock &From,
+ const MachineBasicBlock &To, unsigned ExtraTransformationCosts) const {
+
+ assert(is_contained(Branch.getParent()->successors(), &From));
+ BranchWeightCostModel CostModel{*this, Branch, From,
+ ExtraTransformationCosts};
+
+ const MachineFunction *MF = From.getParent();
+ for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+ MBBI != End && MBBI != ToI; ++MBBI) {
+ const MachineBasicBlock &MBB = *MBBI;
+
+ for (const MachineInstr &MI : MBB) {
+ // When a uniform loop is inside non-uniform control flow, the branch
+ // leaving the loop might never be taken when EXEC = 0.
+ // Hence we should retain cbranch out of the loop lest it become infinite.
+ if (MI.isConditionalBranch())
+ return true;
+
+ if (MI.isMetaInstruction())
+ continue;
+
+ if (hasUnwantedEffectsWhenEXECEmpty(MI))
+ return true;
+
+ if (!CostModel.isProfitable(MI))
+ return true;
+ }
+ }
+
+ return false;
+}
+
bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 7041b59964645a..863b6612973238 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -87,6 +87,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
TargetSchedModel SchedModel;
mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
+public:
// The inverse predicate should have the negative value.
enum BranchPredicate {
INVALID_BR = 0,
@@ -98,6 +99,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
EXECZ = 3
};
+private:
using SetVectorType = SmallSetVector<MachineInstr *, 32>;
static unsigned getBranchOpcode(BranchPredicate Cond);
@@ -1031,13 +1033,21 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// Return true if the instruction modifies the mode register.q
static bool modifiesModeRegister(const MachineInstr &MI);
+ /// Returns true if the execz branch \p Branch that skips the blocks in
+ /// [\p From, \p To) must be kept: removing it is unsafe or unprofitable.
+ bool mustRetainExeczBranch(const MachineInstr &Branch,
+ const MachineBasicBlock &From,
+ const MachineBasicBlock &To,
+ unsigned ExtraTransformationCosts = 0) const;
+
/// This function is used to determine if an instruction can be safely
/// executed under EXEC = 0 without hardware error, indeterminate results,
/// and/or visible effects on future vector execution or outside the shader.
- /// Note: as of 2024 the only use of this is SIPreEmitPeephole where it is
- /// used in removing branches over short EXEC = 0 sequences.
- /// As such it embeds certain assumptions which may not apply to every case
- /// of EXEC = 0 execution.
+ /// Note: as of 2024 the only users of this are SIPreEmitPeephole and
+ /// AMDGPUDemoteSCCBranchToExecz (through SIInstrInfo::mustRetainExeczBranch)
+ /// where it is used in removing branches over short EXEC = 0 sequences. As
+ /// such it embeds certain assumptions which may not apply to every case of
+ /// EXEC = 0 execution.
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
/// Returns true if the instruction could potentially depend on the value of
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 701084844cd9b4..1c8beca0865369 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -15,8 +15,6 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/TargetSchedule.h"
-#include "llvm/Support/BranchProbability.h"
using namespace llvm;
@@ -35,9 +33,6 @@ class SIPreEmitPeephole : public MachineFunctionPass {
MachineBasicBlock *&TrueMBB,
MachineBasicBlock *&FalseMBB,
SmallVectorImpl<MachineOperand> &Cond);
- bool mustRetainExeczBranch(const MachineInstr &Branch,
- const MachineBasicBlock &From,
- const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
public:
@@ -299,84 +294,6 @@ bool SIPreEmitPeephole::getBlockDestinations(
return true;
}
-namespace {
-class BranchWeightCostModel {
- const SIInstrInfo &TII;
- const TargetSchedModel &SchedModel;
- BranchProbability BranchProb;
- static constexpr uint64_t BranchNotTakenCost = 1;
- uint64_t BranchTakenCost;
- uint64_t ThenCyclesCost = 0;
-
-public:
- BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
- const MachineBasicBlock &Succ)
- : TII(TII), SchedModel(TII.getSchedModel()) {
- const MachineBasicBlock &Head = *Branch.getParent();
- const auto *FromIt = find(Head.successors(), &Succ);
- assert(FromIt != Head.succ_end());
-
- BranchProb = Head.getSuccProbability(FromIt);
- if (BranchProb.isUnknown())
- BranchProb = BranchProbability::getZero();
- BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
- }
-
- bool isProfitable(const MachineInstr &MI) {
- if (TII.isWaitcnt(MI.getOpcode()))
- return false;
-
- ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
-
- // Consider `P = N/D` to be the probability of execz being false (skipping
- // the then-block) The transformation is profitable if always executing the
- // 'then' block is cheaper than executing sometimes 'then' and always
- // executing s_cbranch_execz:
- // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
- // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
- // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
- // BranchNotTakenCost
- uint64_t Numerator = BranchProb.getNumerator();
- uint64_t Denominator = BranchProb.getDenominator();
- return (Denominator - Numerator) * ThenCyclesCost <=
- ((Denominator - Numerator) * BranchTakenCost +
- Numerator * BranchNotTakenCost);
- }
-};
-
-bool SIPreEmitPeephole::mustRetainExeczBranch(
- const MachineInstr &Branch, const MachineBasicBlock &From,
- const MachineBasicBlock &To) const {
- assert(is_contained(Branch.getParent()->successors(), &From));
- BranchWeightCostModel CostModel{*TII, Branch, From};
-
- const MachineFunction *MF = From.getParent();
- for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
- MBBI != End && MBBI != ToI; ++MBBI) {
- const MachineBasicBlock &MBB = *MBBI;
-
- for (const MachineInstr &MI : MBB) {
- // When a uniform loop is inside non-uniform control flow, the branch
- // leaving the loop might never be taken when EXEC = 0.
- // Hence we should retain cbranch out of the loop lest it become infinite.
- if (MI.isConditionalBranch())
- return true;
-
- if (MI.isMetaInstruction())
- continue;
-
- if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
- return true;
-
- if (!CostModel.isProfitable(MI))
- return true;
- }
- }
-
- return false;
-}
-} // namespace
-
// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
MachineBasicBlock &SrcMBB) {
@@ -396,7 +313,7 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return false;
// Consider only when it is legal and profitable
- if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
+ if (TII->mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
return false;
LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
index aa38f43368694d..a305762cd4a55b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
@@ -101,8 +101,8 @@ define void @uniform_br_profitable(i32 noundef inreg %value, ptr addrspace(8) no
; GFX9-LABEL: uniform_br_profitable:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lt_i32 s21, 1
-; GFX9-NEXT: s_cbranch_scc1 .LBB2_2
+; GFX9-NEXT: v_cmp_ge_i32_e64 vcc, s21, 1
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if.then
; GFX9-NEXT: s_mov_b32 s11, s18
; GFX9-NEXT: s_mov_b32 s10, s17
@@ -111,26 +111,47 @@ define void @uniform_br_profitable(i32 noundef inreg %value, ptr addrspace(8) no
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s19
; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
-; GFX9-NEXT: .LBB2_2: ; %if.end
+; GFX9-NEXT: ; %bb.2: ; %if.end
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: uniform_br_profitable:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lt_i32 s21, 1
-; GFX10-NEXT: s_cbranch_scc1 .LBB2_2
-; GFX10-NEXT: ; %bb.1: ; %if.then
-; GFX10-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-NEXT: v_mov_b32_e32 v1, s19
-; GFX10-NEXT: s_mov_b32 s11, s18
-; GFX10-NEXT: s_mov_b32 s10, s17
-; GFX10-NEXT: s_mov_b32 s9, s16
-; GFX10-NEXT: s_mov_b32 s8, s7
-; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
-; GFX10-NEXT: .LBB2_2: ; %if.end
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX1010-LABEL: uniform_br_profitable:
+; GFX1010: ; %bb.0: ; %entry
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_cmp_ge_i32_e64 vcc_lo, s21, 1
+; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1010-NEXT: ; %bb.1: ; %if.then
+; GFX1010-NEXT: v_mov_b32_e32 v0, s6
+; GFX1010-NEXT: v_mov_b32_e32 v1, s19
+; GFX1010-NEXT: s_mov_b32 s11, s18
+; GFX1010-NEXT: s_mov_b32 s10, s17
+; GFX1010-NEXT: s_mov_b32 s9, s16
+; GFX1010-NEXT: s_mov_b32 s8, s7
+; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1010-NEXT: ; %bb.2: ; %if.end
+; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-NEXT: s_mov_b32 exec_lo, s4
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: uniform_br_profitable:
+; GFX1030: ; %bb.0: ; %entry
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: s_mov_b32 s4, exec_lo
+; GFX1030-NEXT: v_cmpx_ge_i32_e64 s21, 1
+; GFX1030-NEXT: ; %bb.1: ; %if.then
+; GFX1030-NEXT: v_mov_b32_e32 v0, s6
+; GFX1030-NEXT: v_mov_b32_e32 v1, s19
+; GFX1030-NEXT: s_mov_b32 s11, s18
+; GFX1030-NEXT: s_mov_b32 s10, s17
+; GFX1030-NEXT: s_mov_b32 s9, s16
+; GFX1030-NEXT: s_mov_b32 s8, s7
+; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1030-NEXT: ; %bb.2: ; %if.end
+; GFX1030-NEXT: s_mov_b32 exec_lo, s4
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp = icmp sgt i32 %flag, 0
br i1 %cmp, label %if.then, label %if.end, !prof !1
More information about the llvm-commits
mailing list