[llvm] [AMDGPU] Added hot-block-rematerialize pass (PR #126331)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 7 17:31:06 PST 2025
github-actions[bot] wrote:
:warning: The C/C++ code formatter, clang-format, found issues in your code. :warning:
<details>
<summary>
You can test this locally with the following command:
</summary>
``````````bash
git-clang-format --diff 2eb44aa0a94a8d4230c1c9a0c306af16bfc92925 a13cfc4dcc49c810182bf5ca2bd3b3f0a40c75cd --extensions cpp,h -- llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h llvm/include/llvm/CodeGen/TargetRegisterInfo.h llvm/lib/CodeGen/TargetRegisterInfo.cpp llvm/lib/Target/AMDGPU/AMDGPU.h llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/lib/Target/AMDGPU/GCNRegPressure.h llvm/lib/Target/AMDGPU/SIInstrInfo.h
``````````
</details>
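If the suggested fixes look right, one way to apply them locally is to save the diff below to a file and apply it with `git apply` — a sketch, not part of the bot's standard instructions, assuming the patch is saved as `clang-format.patch` (a hypothetical name) at the repository root:

``````````bash
# Assumption: the diff from the <details> section below has been saved
# verbatim as clang-format.patch at the repository root.
git apply clang-format.patch

# Inspect what changed before amending or committing.
git diff --stat
``````````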
<details>
<summary>
View the diff from clang-format here.
</summary>
``````````diff
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index d37796a828..d21fb9e0dd 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -796,9 +796,10 @@ TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask(
}
if (BestIdx == 0) {
- LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for "
- << getRegClassName(RC) << " mask " << PrintLaneMask(mask)
- << '\n');
+ LLVM_DEBUG(
+ dbgs() << "Unable to find minimal spanning sub register(s) for "
+ << getRegClassName(RC) << " mask " << PrintLaneMask(mask)
+ << '\n');
assert(false && "Impossible to span reg class");
return std::vector<unsigned>();
}
@@ -809,4 +810,3 @@ TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask(
return result;
}
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 8647185bf5..afb4e0bafa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -1,4 +1,5 @@
-//===-- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize-------===//
+//===-- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block
+//Rematerialize-------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,24 +14,24 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "AMDGPUMIRUtils.h"
#include "AMDGPUMirDivergenceAnalysis.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
#include "AMDGPUSubExpDag.h"
+#include "AMDGPUSubtarget.h"
#include "AMDGPUVMemDegreeDAG.h"
-#include "AMDGPUOccupancyAndLatencyHelper.h"
#include "GCNRegPressure.h"
#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
#include "SIMachineFunctionInfo.h"
-#include "AMDGPUMIRUtils.h"
+#include "SIRegisterInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
@@ -40,20 +41,24 @@
using namespace llvm;
static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
-static cl::opt<bool> EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
-static cl::opt<bool> EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive");
-static cl::opt<bool> EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone");
+static cl::opt<bool>
+ EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
+static cl::opt<bool>
+ EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive");
+static cl::opt<bool>
+ EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone");
static cl::opt<bool> EnableVmemDegree("amdgpu-remat-enable-vmem-degree");
static cl::opt<bool> EnableInBlockRemat("amdgpu-remat-enable-in-blk-remat");
static cl::opt<bool> EnableSubExp("amdgpu-remat-enable-sub-exp-remat");
-static cl::opt<bool> EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos");
-static cl::opt<bool> EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg");
+static cl::opt<bool>
+ EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos");
+static cl::opt<bool>
+ EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg");
namespace {
typedef DenseSet<MachineInstr *> InstSet;
typedef DenseSet<MachineBasicBlock *> BlockSet;
-template<typename T>
-using BlockMap = MapVector<MachineBasicBlock *, T>;
+template <typename T> using BlockMap = MapVector<MachineBasicBlock *, T>;
// Rematerialize in a single pass instead of doing in register allcation.
// If in register allocation, fail to rematerialize will cause spill.
@@ -62,9 +67,9 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
public:
static char ID;
- DenseSet<const MachineInstr*> TotalUniformInsts;
- DenseSet<const MachineInstr*> SafeToRemoveInsts;
- DenseSet<const MachineInstr*> DivergentInsts;
+ DenseSet<const MachineInstr *> TotalUniformInsts;
+ DenseSet<const MachineInstr *> SafeToRemoveInsts;
+ DenseSet<const MachineInstr *> DivergentInsts;
void RemoveInst(const MachineInstr *MI) {
TotalUniformInsts.erase(MI);
SafeToRemoveInsts.erase(MI);
@@ -96,9 +101,8 @@ typedef AMDGPUHotBlockRematerialize Remat;
// Util functions.
namespace {
-MachineBasicBlock *
-nearest_common_dominator(MachineDominatorTree *DT,
- BlockSet &Blocks) {
+MachineBasicBlock *nearest_common_dominator(MachineDominatorTree *DT,
+ BlockSet &Blocks) {
auto I = Blocks.begin(), E = Blocks.end();
MachineBasicBlock *DomB = cast<MachineBasicBlock>(*(I++));
@@ -214,10 +218,10 @@ bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
return true;
}
-
// SGPR has alignment requirment, cannot get accurate reg number.
const unsigned NearTargetRegLimit = 10;
-bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, MachineFunction &MF) {
+bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST,
+ MachineFunction &MF) {
unsigned maxSGPR = ST->getAddressableNumSGPRs();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
@@ -251,10 +255,10 @@ struct RematStatus {
DenseSet<MachineBasicBlock *> MemWriteMBBSet;
};
-unsigned CollectMBBPressure(
- MachineBasicBlock &MBB, LiveIntervals *LIS, const MachineRegisterInfo &MRI,
- const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure,
- RematStatus &status) {
+unsigned CollectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS,
+ const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST, unsigned &maxVPressure,
+ unsigned &maxSPressure, RematStatus &status) {
// Skip processing current block if it has only debug instructions
if (MBB.getFirstNonDebugInstr() == MBB.end())
return ST->getOccupancyWithNumVGPRs(0);
@@ -287,10 +291,10 @@ unsigned CollectMBBPressure(
return RP.getOccupancy(*ST);
}
-unsigned CollectFnPressure(
- MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI,
- const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure,
- RematStatus &status) {
+unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS,
+ const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST, unsigned &maxVPressure,
+ unsigned &maxSPressure, RematStatus &status) {
unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second;
// If only have one block, input/ouput virtual live set are empty.
if (MF.size() > 1) {
@@ -349,15 +353,13 @@ unsigned CollectFnPressure(
LLVM_DEBUG(
const SIRegisterInfo *SIRI = ST->getRegisterInfo();
- dbgs() << "output live"; for (auto &it
- : status.MBBOutputLiveMap) {
+ dbgs() << "output live"; for (auto &it : status.MBBOutputLiveMap) {
unsigned Idx = it.first->getNumber();
auto LiveReg = it.second;
dbgs() << "MBB" << Idx << ":";
llvm::dumpLiveSet(LiveReg, SIRI);
} dbgs() << "input live";
- for (auto &it
- : status.MBBInputLiveMap) {
+ for (auto &it : status.MBBInputLiveMap) {
unsigned Idx = it.first->getNumber();
auto LiveReg = it.second;
dbgs() << "MBB" << Idx << ":";
@@ -373,14 +375,14 @@ unsigned CollectFnPressure(
}
return TgtOcc;
}
-RematStatus
-GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
- const MachineRegisterInfo &MRI, const GCNSubtarget *ST) {
+RematStatus GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI,
+ LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST) {
unsigned maxSPressure = 0;
unsigned maxVPressure = 0;
RematStatus status;
- unsigned TgtOcc = CollectFnPressure(MF, LIS, MRI, ST, maxVPressure,
- maxSPressure, status);
+ unsigned TgtOcc =
+ CollectFnPressure(MF, LIS, MRI, ST, maxVPressure, maxSPressure, status);
const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
if (TgtOcc >= MaxOcc) {
status.TargetOcc = TgtOcc;
@@ -415,7 +417,7 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
TgtOcc = bigOcc;
bNotBalance = true;
if (TgtOcc >= MaxOccupancy)
- TgtOcc = MaxOccupancy-1;
+ TgtOcc = MaxOccupancy - 1;
}
}
@@ -433,7 +435,7 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
vInputPressure += RegSize;
} else {
unsigned RegIndex = SIRI->getHWRegIndex(Reg);
- uint64_t mask = ((1 << RegSize) - 1 ) << RegIndex;
+ uint64_t mask = ((1 << RegSize) - 1) << RegIndex;
sInputMask |= mask;
}
}
@@ -448,7 +450,6 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
mask = mask << 4;
}
-
// If balanced, try next occupancy.
TgtOcc = bNotBalance ? TgtOcc : (TgtOcc + 1);
@@ -611,8 +612,7 @@ int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR,
}
int GetReducedSize(MapVector<unsigned, RematNode> &RematMap, bool bVGPR,
- GCNRPTracker::LiveRegSet &CanidateSet,
- InstSet &ReducedInsts,
+ GCNRPTracker::LiveRegSet &CanidateSet, InstSet &ReducedInsts,
const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
BlockLiveInfo &LiveInfo,
DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
@@ -788,9 +788,11 @@ void BuildRematCandiates(std::vector<RematNode> &Candidates,
}
// For case like
-// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, implicit-def dead $scc; xb.uniform
-// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; xb.uniform
-// %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit killed $scc; xb.uniform
+// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0,
+// implicit-def dead $scc; xb.uniform
+// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc;
+// xb.uniform %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit
+// killed $scc; xb.uniform
// Sink S_AND right before S_CSELECT will overwrite SCC.
// To avoid it, skip case when DefMI and UseMI has implicit define use.
bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) {
@@ -970,7 +972,7 @@ int FilterRematCandiates(std::vector<RematNode> &Candidates,
}
void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef,
- SmallVector<MachineInstr *, 2> &userMIs) {
+ SmallVector<MachineInstr *, 2> &userMIs) {
for (MachineInstr *UseMI : userMIs) {
for (MachineOperand &MO : UseMI->operands()) {
if (!MO.isReg())
@@ -996,7 +998,6 @@ DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
}
}
-
// For userBlocks which dominate all hotBlocks, don't need to clone because
// the value not cross hotBlocks when later blocks are cloned.
// For userBlocks which dominated by all hotBlocks, they could share clones
@@ -1061,68 +1062,45 @@ DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
// Look for an earlier insert point if the InstructionToMove
// writes to scc and scc is live at the CurrentInsertPoint.
static MachineBasicBlock::iterator AdjustInsertPointToAvoidSccSmash(
- MachineInstr *InstructionToMove,
- MachineBasicBlock *MBB,
- MachineBasicBlock::iterator CurrentInsertPoint,
- MachineRegisterInfo &MRI,
- const SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII
-)
-{
- const bool WillSmashScc = InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
- if (WillSmashScc)
- {
- CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB,
- CurrentInsertPoint,
- SIRI,
- SIII,
- &MRI
- );
- }
-
- return CurrentInsertPoint;
+ MachineInstr *InstructionToMove, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ const bool WillSmashScc =
+ InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
+ if (WillSmashScc) {
+ CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(
+ MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+ }
+
+ return CurrentInsertPoint;
}
// Look for an earlier insert point if the SubExp
// writes to scc and scc is live at the CurrentInsertPoint.
static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash(
- const SubExp &SubExpToMove,
- MachineBasicBlock *MBB,
- MachineBasicBlock::iterator CurrentInsertPoint,
- MachineRegisterInfo& MRI,
- const SIRegisterInfo* SIRI,
- const SIInstrInfo* SIII
-)
-{
- const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI);
- if (WillSmashScc)
- {
- CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB,
- CurrentInsertPoint,
- SIRI,
- SIII,
- &MRI
- );
- }
-
- return CurrentInsertPoint;
+ const SubExp &SubExpToMove, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI);
+ if (WillSmashScc) {
+ CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(
+ MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+ }
+
+ return CurrentInsertPoint;
}
// Return trun if moving MI to Location will smash a live scc value.
-static bool WillSmashSccAtLocation(
- MachineInstr* MI,
- MachineBasicBlock* MBB,
- MachineBasicBlock::iterator Location
-)
-{
- // It is ok to pass nullptr to `modifiesRegister` for TRI here since
- // SCC has no subreg/suprereg relationships.
- return MI->modifiesRegister(AMDGPU::SCC, nullptr)
- && llvm::IsSccLiveAt(MBB, Location);
+static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator Location) {
+ // It is ok to pass nullptr to `modifiesRegister` for TRI here since
+ // SCC has no subreg/suprereg relationships.
+ return MI->modifiesRegister(AMDGPU::SCC, nullptr) &&
+ llvm::IsSccLiveAt(MBB, Location);
}
-void ApplyCloneRemat(Remat *Remat,
- RematNode &Node, std::vector<BlockLiveInfo> &hotBlocks,
+void ApplyCloneRemat(Remat *Remat, RematNode &Node,
+ std::vector<BlockLiveInfo> &hotBlocks,
MachineDominatorTree *pDT, MachineRegisterInfo &MRI,
SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
const SIInstrInfo *SIII, MachineFunction &MF) {
@@ -1182,10 +1160,9 @@ void ApplyCloneRemat(Remat *Remat,
InsertPointMI = UseMI;
}
}
-
+
MachineBasicBlock::iterator InsertPoint = AdjustInsertPointToAvoidSccSmash(
- DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII
- );
+ DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII);
for (MachineMemOperand *MO : DefMI->memoperands()) {
NewDef->addMemOperand(MF, MO);
@@ -1218,10 +1195,11 @@ void ApplyCloneRemat(Remat *Remat,
void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
SlotIndexes *slotIndexes,
- const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII) {
MachineInstr *DefMI = Node.DefMI;
MachineInstr *InsertPointMI = Node.InsertPointMI;
- MachineBasicBlock* MBB = nullptr;
+ MachineBasicBlock *MBB = nullptr;
// Find a valid insert point.
MachineBasicBlock::iterator InsertPoint;
@@ -1233,10 +1211,9 @@ void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
MBB = Node.InsertBlock;
}
- InsertPoint = AdjustInsertPointToAvoidSccSmash(
- DefMI, MBB, InsertPoint, MRI, SIRI, SIII
- );
-
+ InsertPoint = AdjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI,
+ SIRI, SIII);
+
// Move instruction to new location.
DefMI->removeFromParent();
InsertPoint->getParent()->insert(InsertPoint, DefMI);
@@ -1268,7 +1245,8 @@ void ApplyRemat(Remat *Remat, MapVector<unsigned, RematNode> &RematMap,
if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII);
} else if (Node.Kind == RematNode::RematKind::Clone) {
- ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, MF);
+ ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII,
+ MF);
}
}
}
@@ -1502,7 +1480,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg);
if (UseMI.getParent() != MBB)
continue;
- int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI, /*bVGPR*/false);
+ int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI,
+ /*bVGPR*/ false);
if (gain > 0) {
// Skip case when DefMI has implicit define which used by UseMI.
if (isImplicitDefUse(&MI, &UseMI)) {
@@ -1536,8 +1515,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
bool bNeedVRemat = rematVCnt > 0;
// If sgpr spill, always do remat.
bool bSRematOK =
- (newRematSCnt <= 0 && !SRematMap.empty()) ||
- bForceRematSgpr;
+ (newRematSCnt <= 0 && !SRematMap.empty()) || bForceRematSgpr;
bool bVRematOK =
(status.bNotBalance || newRematVCnt <= 0) && !VRematMap.empty();
if (bNeedSRemat && bNeedVRemat) {
@@ -1572,7 +1550,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
if (!SRematMap.empty()) {
bUpdated = true;
- ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII, MF);
+ ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII,
+ MF);
LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs()););
}
@@ -1592,49 +1571,46 @@ bool isPhyRegUniqueDef(unsigned Reg, const MachineRegisterInfo &MRI) {
return DefMIs.size() == 1;
}
-static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg)
-{
- if (!MO.isImplicit() || !MO.isUse() || !MO.isReg())
- {
- return false;
- }
+static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) {
+ if (!MO.isImplicit() || !MO.isUse() || !MO.isReg()) {
+ return false;
+ }
- return MO.getReg() == Reg;
+ return MO.getReg() == Reg;
}
-static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg)
-{
- if (!MO.isImplicit() || !MO.isDef() || !MO.isReg())
- {
- return false;
- }
+static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) {
+ if (!MO.isImplicit() || !MO.isDef() || !MO.isReg()) {
+ return false;
+ }
- return MO.getReg() == Reg;
+ return MO.getReg() == Reg;
}
-static bool IsSafeRematCandidateUser(const MachineInstr *UseMI, const SIInstrInfo *SIII)
-{
- // Make sure UseMI is not wqm like sample.
- if (SIII->isWQM(UseMI->getOpcode()))
- return false;
- if (UseMI->getOpcode() == AMDGPU::PHI)
- return false;
-
- return true;
+static bool IsSafeRematCandidateUser(const MachineInstr *UseMI,
+ const SIInstrInfo *SIII) {
+ // Make sure UseMI is not wqm like sample.
+ if (SIII->isWQM(UseMI->getOpcode()))
+ return false;
+ if (UseMI->getOpcode() == AMDGPU::PHI)
+ return false;
+
+ return true;
}
static bool isConvergent(Remat *Remat, const MachineInstr &MI) {
return MI.isConvergent() &&
- // This flag is set on readfirstlane's to indicate that they
- // are redundant (the value being read is already uniform).
- // Normally, readfirstlanes are convergent, because different exec
- // will cause a different value to be read; a known uniform
- // readfirstlane is safe to move or clone and not actually convergent.
- !Remat->TotalUniformInsts.count(&MI);
+ // This flag is set on readfirstlane's to indicate that they
+ // are redundant (the value being read is already uniform).
+ // Normally, readfirstlanes are convergent, because different exec
+ // will cause a different value to be read; a known uniform
+ // readfirstlane is safe to move or clone and not actually convergent.
+ !Remat->TotalUniformInsts.count(&MI);
}
bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
- const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, bool bSink) {
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ bool bSink) {
if (Reg.isPhysical())
return false;
bool bVGPR = SIRI->isVGPR(MRI, Reg);
@@ -1661,7 +1637,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
if (!Op.isReg())
continue;
Register OpReg = Op.getReg();
- if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO))
+ if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) ||
+ IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO))
continue;
if (IsImplicitUseOfReg(Op, AMDGPU::MODE))
continue;
@@ -1672,7 +1649,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
continue;
if (OpReg.isPhysical())
return false;
- if (!MRI.getUniqueVRegDef(OpReg) && !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) {
+ if (!MRI.getUniqueVRegDef(OpReg) &&
+ !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) {
return false;
}
}
@@ -1693,12 +1671,10 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
}
std::vector<SubExp> buildSubExpFromCandidates(
- Remat *Remat,
- GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
+ Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes,
- GCNRPTracker::LiveRegSet &unUsedPassThrus,
- bool bAllowPartialUseInSubExp) {
+ GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) {
InstSet CandidateDefs;
DenseSet<unsigned> RemovedCandidates;
std::vector<unsigned> CandidateRegs;
@@ -1795,7 +1771,7 @@ std::vector<SubExp> buildSubExpFromCandidates(
break;
}
- if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/true))
+ if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true))
continue;
// If all users of MI are in candidate defs, add MI into candidate defs.
@@ -1852,10 +1828,9 @@ std::vector<SubExp> buildSubExpFromCandidates(
defs.emplace_back(pMI);
}
- LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI
- : defs) {
- MI->dump();
- } dbgs() << "\nFinished Candidate Defs End\n";);
+ LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n";
+ for (MachineInstr *MI : defs) { MI->dump(); } dbgs()
+ << "\nFinished Candidate Defs End\n";);
// Build SubExp with CandidateDefs as Nodes, CandidateInput as input
// Candidates as output.
@@ -1874,10 +1849,8 @@ std::vector<SubExp> buildSubExpFromCandidates(
return dag.SubExps;
}
-
std::vector<SubExp> buildSubExpFromCandidatesTopBottom(
- Remat* Remat,
- GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
+ Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes) {
InstSet CandidateDefs;
@@ -2043,13 +2016,11 @@ std::vector<SubExp> buildSubExpFromCandidatesTopBottom(
defs.emplace_back(pMI);
}
- LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI
- : defs) {
- MI->dump();
- } dbgs() << "\nFinished Candidate Defs End\n";);
+ LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n";
+ for (MachineInstr *MI : defs) { MI->dump(); } dbgs()
+ << "\nFinished Candidate Defs End\n";);
- LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it
- : LocalCandidates) {
+ LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it : LocalCandidates) {
pressure::print_reg(it.first, MRI, SIRI, llvm::dbgs());
} dbgs() << "\nLocalCandidates End\n";);
// Make sure all input reg are uniqueDef.
@@ -2061,7 +2032,6 @@ std::vector<SubExp> buildSubExpFromCandidatesTopBottom(
return dag.SubExps;
}
-
void print_vreg(Register Reg, const MachineRegisterInfo &MRI) {
if (Reg.isVirtual()) {
StringRef Name = MRI.getVRegName(Reg);
@@ -2099,8 +2069,7 @@ MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB,
void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI,
MachineDominatorTree *pDT,
- SlotIndexes *slotIndexes,
- const SIInstrInfo *SIII,
+ SlotIndexes *slotIndexes, const SIInstrInfo *SIII,
const SIRegisterInfo *SIRI) {
// Move from bottom.
MachineBasicBlock *FromBB = Exp.FromBB;
@@ -2115,12 +2084,14 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI,
continue;
// Do not overwrite a live scc.
- MachineBasicBlock::iterator InsertPoint = ToBB->SkipPHIsAndLabels(ToBB->begin());
+ MachineBasicBlock::iterator InsertPoint =
+ ToBB->SkipPHIsAndLabels(ToBB->begin());
if (WillSmashSccAtLocation(DefMI, ToBB, InsertPoint))
continue;
DefMI->removeFromParent();
- assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) && "invalid insert point");
+ assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) &&
+ "invalid insert point");
ToBB->insert(InsertPoint, DefMI);
// Debug insts don't need slot index.
if (DefMI->isDebugInstr())
@@ -2131,12 +2102,11 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI,
}
}
-
void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI,
- MachineDominatorTree *pDT,
- SlotIndexes *slotIndexes,
- const SIInstrInfo *SIII,
- const SIRegisterInfo *SIRI) {
+ MachineDominatorTree *pDT,
+ SlotIndexes *slotIndexes,
+ const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI) {
// Move from top.
// Find lowest input def.
MachineBasicBlock *ToBB = Exp.ToBB;
@@ -2152,9 +2122,8 @@ void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI,
Terminator = ToBB->end();
}
- Terminator = AdjustInsertPointForSubExpToAvoidSccSmash(
- Exp, ToBB, Terminator, MRI, SIRI, SIII
- );
+ Terminator = AdjustInsertPointForSubExpToAvoidSccSmash(Exp, ToBB, Terminator,
+ MRI, SIRI, SIII);
for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) {
MachineInstr *DefMI = *it;
@@ -2388,11 +2357,12 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector<HotBlock> &hotBlocks,
reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, pDT);
// Sort to make stable order.
- std::sort(userBlocks.begin(), userBlocks.end(),
- [](std::pair<MachineBasicBlock*, SmallVector<MachineInstr*, 2>>& it0,
- std::pair<MachineBasicBlock*, SmallVector<MachineInstr*, 2>>& it1) {
+ std::sort(
+ userBlocks.begin(), userBlocks.end(),
+ [](std::pair<MachineBasicBlock *, SmallVector<MachineInstr *, 2>> &it0,
+ std::pair<MachineBasicBlock *, SmallVector<MachineInstr *, 2>> &it1) {
return it0.first->getNumber() < it1.first->getNumber();
- });
+ });
const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI);
@@ -2481,7 +2451,6 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector<HotBlock> &hotBlocks,
}
}
-
void ApplySubExpCloneNearUserInBlock(
SubExp &Exp,
DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotVInstMap,
@@ -2620,7 +2589,7 @@ unsigned getPacifistLevel(unsigned Reg,
}
bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI) {
for (MachineInstr &def : MRI.def_instructions(Reg)) {
if (def.getParent() != MBB)
continue;
@@ -2655,8 +2624,8 @@ bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &inputLive,
return inputLive.count(Reg) && outputLive.count(Reg);
}
-// Instructions which only use imm/passThru reg/output only reg will not kill any
-// live reg, so name them pacifist here.
+// Instructions which only use imm/passThru reg/output only reg will not kill
+// any live reg, so name them pacifist here.
bool collectPacifist(MachineInstr &MI,
const GCNRPTracker::LiveRegSet &inputLive,
const GCNRPTracker::LiveRegSet &outputLive,
@@ -2699,7 +2668,8 @@ bool collectPacifist(MachineInstr &MI,
if (Reg.isPhysical())
return false;
- if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI))
+ if (nullptr ==
+ getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI))
return false;
bHasDef = true;
@@ -2708,30 +2678,27 @@ bool collectPacifist(MachineInstr &MI,
return bHasDef;
}
-static MachineInstr* findFirstAliasingLoadOrStoreInMBB(
- MachineInstr &MI,
- MachineBasicBlock &MBB,
- AliasAnalysis *AA
-)
-{
- if (MI.mayLoadOrStore())
- {
- for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end(); I != E; ++I)
- {
- const bool UseTBAA = false;
- if (MI.mayAlias(AA, *I, UseTBAA))
- {
- return &*I;
- }
- }
+static MachineInstr *findFirstAliasingLoadOrStoreInMBB(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ AliasAnalysis *AA) {
+ if (MI.mayLoadOrStore()) {
+ for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end();
+ I != E; ++I) {
+ const bool UseTBAA = false;
+ if (MI.mayAlias(AA, *I, UseTBAA)) {
+ return &*I;
+ }
}
+ }
- return nullptr;
+ return nullptr;
}
-static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock &MBB, MachineRegisterInfo &MRI,
- AliasAnalysis *AA,
- SlotIndexes *slotIndexes) {
+static MachineInstr *findPacifistInsertPoint(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineRegisterInfo &MRI,
+ AliasAnalysis *AA,
+ SlotIndexes *slotIndexes) {
SmallVector<MachineInstr *, 2> users;
@@ -2739,14 +2706,13 @@ static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock
// op with which it aliases. Find the first instruction
// that aliases the pacifist MI (if any) and add it to the list
// of users. The sort() below will select the earliest user instruction.
- if (MachineInstr* AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) {
+ if (MachineInstr *AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) {
users.push_back(AliasMI);
}
for (MachineOperand &MO : MI.defs()) {
unsigned Reg = MO.getReg();
- for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg))
- {
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
if (&MBB != UseMI.getParent())
continue;
users.emplace_back(&UseMI);
@@ -2770,8 +2736,7 @@ static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock
bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS,
MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
const SIInstrInfo *SIII, AliasAnalysis *AA,
- RematStatus &status)
-{
+ RematStatus &status) {
const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB];
const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB];
@@ -2792,7 +2757,8 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS,
// Move pacifist to its first user.
for (MachineInstr *MI : pacifistList) {
- MachineInstr *firstUser = findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes);
+ MachineInstr *firstUser =
+ findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes);
if (firstUser == MI)
continue;
if (firstUser == MI->getNextNode())
@@ -2809,14 +2775,15 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS,
// BRANCH may have exec update before it.
insertPoint--;
- insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin());
+ insertPoint =
+ llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin());
while ((insertPoint->definesRegister(AMDGPU::EXEC, SIRI) ||
insertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) &&
- insertPoint != MI->getIterator())
- {
+ insertPoint != MI->getIterator()) {
insertPoint--;
- insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin());
+ insertPoint =
+ llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin());
}
if (insertPoint == MI->getIterator())
continue;
@@ -2882,7 +2849,7 @@ bool collectVToSCrossHotSpot(
const SIInstrInfo *SIII) {
unsigned VLimit = status.TargetVLimit;
unsigned SLimit = status.TargetSLimit;
- auto& ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ auto &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
GCNDownwardRPTracker Tracker(*LIS);
@@ -2921,24 +2888,23 @@ bool collectVToSCrossHotSpot(
VExtra--;
bUpdated = true;
}
-
}
return bUpdated;
}
// Return true if the user is outside of the def's loop.
-static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User, MachineLoopInfo *MLI)
-{
- MachineLoop* L = MLI->getLoopFor(Def->getParent());
+static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User,
+ MachineLoopInfo *MLI) {
+ MachineLoop *L = MLI->getLoopFor(Def->getParent());
return L && !L->contains(User->getParent());
}
bool rematUniformVgprToSgpr(
- Remat *Remat,
- MachineFunction &MF, RematStatus &status,
+ Remat *Remat, MachineFunction &MF, RematStatus &status,
DenseMap<MachineBasicBlock *, GCNRegPressure> &MBBPressureMap,
- std::vector<HotBlock> &hotBlocks, LiveIntervals *LIS, MachineRegisterInfo &MRI,
- const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineLoopInfo *MLI) {
+ std::vector<HotBlock> &hotBlocks, LiveIntervals *LIS,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineLoopInfo *MLI) {
DenseMap<unsigned, MachineInstr *> UniformVgprMap =
collectUniformVgprs(Remat, MF, MRI, SIRI);
@@ -2972,7 +2938,8 @@ bool rematUniformVgprToSgpr(
// Do not replace v->s across loops. Even if the value is uniform
// branch divergence can cause a uniform value in a loop to be
// non-uniform when used outside a loop.
- if (IsSafeRematCandidateUser(&userMI, SIII) && !IsCrossLoopUse(MI, &userMI, MLI))
+ if (IsSafeRematCandidateUser(&userMI, SIII) &&
+ !IsCrossLoopUse(MI, &userMI, MLI))
userMIs.emplace_back(&userMI);
}
@@ -2988,7 +2955,7 @@ bool rematUniformVgprToSgpr(
for (MachineInstr *userMI : userMIs) {
const auto &Desc = userMI->getDesc();
bool bIllegal = false;
- for (unsigned i=0;i<userMI->getNumOperands();i++) {
+ for (unsigned i = 0; i < userMI->getNumOperands(); i++) {
MachineOperand &MO = userMI->getOperand(i);
if (!MO.isReg())
continue;
@@ -3021,7 +2988,8 @@ bool rematUniformVgprToSgpr(
auto rit = userMI->getReverseIterator();
rit++;
auto endIt = userMI->getParent()->rend();
- while (rit != endIt && !rit->isDebugInstr() && !slotIndexes->hasIndex(*rit))
+ while (rit != endIt && !rit->isDebugInstr() &&
+ !slotIndexes->hasIndex(*rit))
slotIndexes->insertMachineInstrInMaps(*(rit++));
}
}
@@ -3107,9 +3075,8 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI,
DenseSet<MachineInstr *> &hotSet, int vDistance, int sDistance,
unsigned VLimit, unsigned SLimit,
const DenseSet<MachineBasicBlock *> &MemWriteMBBSet,
- LiveIntervals *LIS,
- const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII) {
+ LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
auto &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
const auto &SI = LIS->getInstructionIndex(*hotMI).getBaseIndex();
const auto LISLR = llvm::getLiveRegs(SI, *LIS, MRI);
@@ -3134,7 +3101,8 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI,
continue;
// Igonre inst in hot range.
- if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit || RP.getMaxSGPR() > SLimit) {
+ if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit ||
+ RP.getMaxSGPR() > SLimit) {
Tracker.advance();
continue;
}
@@ -3249,7 +3217,7 @@ bool tryRematInHotSpot(
unsigned VLimit = status.TargetVLimit;
unsigned SLimit = status.TargetSLimit;
- auto& ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ auto &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB];
const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB];
@@ -3300,9 +3268,8 @@ bool tryRematInHotSpot(
// Use hotVMI when apply.
inBlockHotSInstMap[&MBB] = nullptr;
if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*bVGPR*/ true, inputLive,
- outputLive, hotSet, vDistance, sDistance, VLimit, SLimit,
- status.MemWriteMBBSet,
- LIS, MRI, SIRI, SIII))
+ outputLive, hotSet, vDistance, sDistance, VLimit, SLimit,
+ status.MemWriteMBBSet, LIS, MRI, SIRI, SIII))
return true;
}
@@ -3312,8 +3279,7 @@ bool tryRematInHotSpot(
inBlockHotVInstMap[&MBB] = nullptr;
return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*bVGPR*/ false,
inputLive, outputLive, hotSet, vDistance, sDistance, VLimit,
- SLimit, status.MemWriteMBBSet,
- LIS, MRI, SIRI, SIII);
+ SLimit, status.MemWriteMBBSet, LIS, MRI, SIRI, SIII);
}
return false;
}
@@ -3444,7 +3410,8 @@ void sortSubExpCandidates(std::vector<SubExp> &subExpCandidates) {
}
}
-// Compare pressure, return ture if maxV0/maxS0 pressure is higher than maxV1/maxS1.
+// Compare pressure, return ture if maxV0/maxS0 pressure is higher than
+// maxV1/maxS1.
bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1,
unsigned maxS1, const GCNSubtarget *ST) {
unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(maxV0);
@@ -3467,10 +3434,11 @@ bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1,
}
// Return true if the subExp can help pressure for passThrus.
-bool canHelpPressureWhenSink(SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus,
- const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII, const MachineLoopInfo *MLI,
- MachineDominatorTree *pDT, bool bCanClone,bool bSgprBound) {
+bool canHelpPressureWhenSink(
+ SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, const MachineLoopInfo *MLI,
+ MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound) {
LLVM_DEBUG(subExp.dump(MRI, SIRI));
if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false))
return false;
@@ -3586,8 +3554,7 @@ bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI,
}
SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
-groupPassThruByDefBlock(Remat *Remat,
- const GCNRPTracker::LiveRegSet &passThrus,
+groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &passThrus,
GCNRPTracker::LiveRegSet &usedPassThrus,
MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
const SIInstrInfo *SIII) {
@@ -3613,16 +3580,17 @@ groupPassThruByDefBlock(Remat *Remat,
GCNRPTracker::LiveRegSet &DefInMBB = Candidates[DefMI->getParent()];
DefInMBB[Reg] = it.second;
}
-
- llvm::SmallVector<std::pair<MachineBasicBlock*, GCNRPTracker::LiveRegSet>> result = Candidates.takeVector();
- LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto it
- : result) {
- MachineBasicBlock *MBB = it.first;
- auto &defInMBB = it.second;
- MBB->dump();
- llvm::dumpLiveSet(defInMBB, SIRI);
- } llvm::dbgs() << "end of candidates\n";);
+ llvm::SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
+ result = Candidates.takeVector();
+
+ LLVM_DEBUG(
+ llvm::dbgs() << "Before sort candidates\n"; for (auto it : result) {
+ MachineBasicBlock *MBB = it.first;
+ auto &defInMBB = it.second;
+ MBB->dump();
+ llvm::dumpLiveSet(defInMBB, SIRI);
+ } llvm::dbgs() << "end of candidates\n";);
std::sort(result.begin(), result.end(),
[](std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet> &it0,
@@ -3630,8 +3598,7 @@ groupPassThruByDefBlock(Remat *Remat,
return it0.first->getNumber() < it1.first->getNumber();
});
- LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it
- : result) {
+ LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it : result) {
MachineBasicBlock *MBB = it.first;
auto &defInMBB = it.second;
MBB->dump();
@@ -3688,7 +3655,8 @@ collectPassThrus(MachineBasicBlock *MBB,
return passThrus;
}
// Try to build a free subExp which all input is passThrus.
-SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, GCNRPTracker::LiveRegSet &passThrus,
+SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp,
+ GCNRPTracker::LiveRegSet &passThrus,
MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) {
SubExp freeExp;
// Try to split the subExp to find a help case.
@@ -3813,9 +3781,9 @@ std::vector<SubExp> buildSubExpCandidates(
// Try to remove out reg def sub exp from DefMBB.
GCNRPTracker::LiveRegSet &DefInMBB = it.second;
// Go up on the dag until reach share node.
- auto subExps =
- buildSubExpFromCandidates(Remat, DefInMBB, DefMBB, SIRI, SIII, MRI,
- slotIndexes, unUsedPassThrus, bAllowPartialUseInSubExp);
+ auto subExps = buildSubExpFromCandidates(
+ Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, slotIndexes, unUsedPassThrus,
+ bAllowPartialUseInSubExp);
for (SubExp &subExp : subExps) {
if (subExp.bHasMemInst) {
// Skip when memory ld/st inst need to cross MBB which write memory.
@@ -3842,11 +3810,13 @@ std::vector<SubExp> buildSubExpCandidates(
}
}
if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, pDT,
- bCanClone, bSgprBound)) {
- if (bAllowPartialUseInSubExp && subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) {
- SubExp freeSubExp = buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI);
- if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, MLI, pDT,
- bCanClone, bSgprBound)) {
+ bCanClone, bSgprBound)) {
+ if (bAllowPartialUseInSubExp &&
+ subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) {
+ SubExp freeSubExp =
+ buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI);
+ if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII,
+ MLI, pDT, bCanClone, bSgprBound)) {
subExpCandidates.emplace_back(freeSubExp);
}
}
@@ -3931,8 +3901,8 @@ calculateSaving(HotBlock &hotBB, std::vector<SubExp> &subExpCandidates,
unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
LLVM_DEBUG(std::string movStr =
Exp.bHoist ? "output hoist:" : "output sink:";
- dbgs() << movStr << Register::virtReg2Index(Reg)
- << " " << Size);
+ dbgs()
+ << movStr << Register::virtReg2Index(Reg) << " " << Size);
// Exp out live at block input.
// It will descrease live for MBB when sink and increase when hoist.
if (SIRI->isVGPR(MRI, Reg)) {
@@ -3969,10 +3939,9 @@ calculateSaving(HotBlock &hotBB, std::vector<SubExp> &subExpCandidates,
// It will increase live for MBB.
unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
- LLVM_DEBUG(std::string movStr =
- Exp.bHoist ? "input hoist:" : "input sink:";
- dbgs() << movStr << Register::virtReg2Index(Reg)
- << " " << Size);
+ LLVM_DEBUG(
+ std::string movStr = Exp.bHoist ? "input hoist:" : "input sink:";
+ dbgs() << movStr << Register::virtReg2Index(Reg) << " " << Size);
if (SIRI->isVGPR(MRI, Reg)) {
LLVM_DEBUG(dbgs() << "v\n");
if (Exp.bHoist)
@@ -4014,8 +3983,8 @@ calculateSaving(HotBlock &hotBB, std::vector<SubExp> &subExpCandidates,
LaneBitmask profitMask = outMask & MBBBeginMask;
if (MBBBeginMask.any()) {
unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
- LLVM_DEBUG(dbgs() << "move:" << Register::virtReg2Index(Reg)
- << " " << Size);
+ LLVM_DEBUG(dbgs()
+ << "move:" << Register::virtReg2Index(Reg) << " " << Size);
// Exp out live at block input.
// It will descrease live for MBB.
if (SIRI->isVGPR(MRI, Reg)) {
@@ -4043,8 +4012,8 @@ calculateSaving(HotBlock &hotBB, std::vector<SubExp> &subExpCandidates,
// It will increase live for MBB.
unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
- LLVM_DEBUG(dbgs() << "add:" << Register::virtReg2Index(Reg)
- << " " << Size);
+ LLVM_DEBUG(dbgs()
+ << "add:" << Register::virtReg2Index(Reg) << " " << Size);
if (SIRI->isVGPR(MRI, Reg)) {
LLVM_DEBUG(dbgs() << "v\n");
vgprDiff += Size;
@@ -4090,8 +4059,8 @@ void addExpCandidates(std::vector<SubExp> &subExpCandidates,
}
bool tryToAddSubExps(
- Remat *Remat,
- HotBlock &hotBB, RematStatus &status, std::vector<SubExp> &subExpCandidates,
+ Remat *Remat, HotBlock &hotBB, RematStatus &status,
+ std::vector<SubExp> &subExpCandidates,
std::vector<SubExp> &inBlockCloneSubExps,
DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotVInstMap,
DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotSInstMap,
@@ -4105,9 +4074,9 @@ bool tryToAddSubExps(
SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *pDT,
bool bCanClone, bool bVOutBound, bool bSOutBound,
GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) {
- std::vector<SubExp> partialSubExps = buildSubExpCandidates(Remat,
- Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT, bCanClone,
- bSOutBound, unUsedPassThrus, status.MemWriteMBBSet,
+ std::vector<SubExp> partialSubExps = buildSubExpCandidates(
+ Remat, Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT,
+ bCanClone, bSOutBound, unUsedPassThrus, status.MemWriteMBBSet,
bAllowPartialUseInSubExp);
GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive;
@@ -4177,8 +4146,8 @@ bool tryToAddSubExps(
// Try to remove out reg def sub exp from DefMBB.
GCNRPTracker::LiveRegSet &UseInMBB = it.second;
// Go up on the dag until reach share node.
- auto subExps = buildSubExpFromCandidatesTopBottom(Remat, UseInMBB, UseMBB, SIRI,
- SIII, MRI, slotIndexes);
+ auto subExps = buildSubExpFromCandidatesTopBottom(
+ Remat, UseInMBB, UseMBB, SIRI, SIII, MRI, slotIndexes);
for (SubExp &subExp : subExps) {
if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, bSOutBound))
continue;
@@ -4211,8 +4180,7 @@ bool tryToAddSubExps(
if (EnableVmemDegree &&
// Only expect vmem when last tryToAddSubExps.
// If not, bAllowPartialUseInSubExp will no chance to be true.
- (bAllowPartialUseInSubExp ||
- !EnableSubExpAggressive)) {
+ (bAllowPartialUseInSubExp || !EnableSubExpAggressive)) {
// Assume vmemLdSize could be optimized by not parallel.
if (((vgpr - hotBB.vmemLdInputSize) <= VLimit ||
(vgpr - hotBB.vmemLdOutputSize) <= VLimit) &&
@@ -4251,8 +4219,7 @@ bool tryToAddSubExps(
// Reason to do it per block is to make sure passthru reuse is precise.
// If try remat on all hot blocks together, the passthru might be on one block,
// but the reuse in on another block which the reg is not passthru there.
-bool perBlockPassthruRemat(Remat *Remat,
- std::vector<HotBlock> &hotBlocks,
+bool perBlockPassthruRemat(Remat *Remat, std::vector<HotBlock> &hotBlocks,
RematStatus &status,
GCNRPTracker::LiveRegSet &liveRegCandidates,
const GCNSubtarget *ST, LiveIntervals *LIS,
@@ -4261,8 +4228,7 @@ bool perBlockPassthruRemat(Remat *Remat,
const SIRegisterInfo *SIRI,
const SIInstrInfo *SIII) {
bool bUpdated = false;
- bool bCanClone = EnableSubExpClone |
- EnableSubExpAggressive;
+ bool bCanClone = EnableSubExpClone | EnableSubExpAggressive;
SlotIndexes *slotIndexes = LIS->getSlotIndexes();
// Sort hot blocks by pressure first.
@@ -4326,19 +4292,19 @@ bool perBlockPassthruRemat(Remat *Remat,
// Group pass thru regs by def MBB.
SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
- Candidates =
- groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, MRI, SIRI, SIII);
+ Candidates = groupPassThruByDefBlock(Remat, passThrus, usedPassThrus,
+ MRI, SIRI, SIII);
// unUsedPassThrus used to collect passThru which is skipped when build
// subExp.
GCNRPTracker::LiveRegSet unusedPassThrus;
// Build exp dag on define blocks.
bool bAllowPartialUseInSubExp = false;
- if (tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps,
- inBlockHotVInstMap, inBlockHotSInstMap, Candidates,
- vgpr, sgpr, savingInputLive, savingOutputLive,
- passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes,
- LIS, pDT, bCanClone, bVOutBound, bSOutBound,
- unusedPassThrus, bAllowPartialUseInSubExp)) {
+ if (tryToAddSubExps(
+ Remat, it, status, subExpCandidates, inBlockCloneSubExps,
+ inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr,
+ savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI,
+ SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound,
+ unusedPassThrus, bAllowPartialUseInSubExp)) {
// Remove unusedPassThrus from passThrus first.
llvm::andNotLiveRegSet(passThrus, unusedPassThrus);
llvm::mergeLiveRegSet(usedPassThrus, passThrus);
@@ -4354,12 +4320,12 @@ bool perBlockPassthruRemat(Remat *Remat,
return false;
bAllowPartialUseInSubExp = true;
- if (!tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps,
- inBlockHotVInstMap, inBlockHotSInstMap, Candidates,
- vgpr, sgpr, savingInputLive, savingOutputLive,
- passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes,
- LIS, pDT, bCanClone, bVOutBound, bSOutBound,
- unusedPassThrus, bAllowPartialUseInSubExp)) {
+ if (!tryToAddSubExps(
+ Remat, it, status, subExpCandidates, inBlockCloneSubExps,
+ inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr,
+ savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI,
+ SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound,
+ unusedPassThrus, bAllowPartialUseInSubExp)) {
return false;
}
// Just merge all passThrus after tryToAddSubExps allow partialUseInSubExp.
@@ -4425,10 +4391,9 @@ int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII,
} // namespace
-bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
- MachineDominatorTree *pDT, MachinePostDominatorTree *pPDT,
- AliasAnalysis *AA)
-{
+bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
+ LiveIntervals *LIS, MachineDominatorTree *pDT,
+ MachinePostDominatorTree *pPDT, AliasAnalysis *AA) {
if (MF.size() < 2)
return false;
const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
@@ -4490,7 +4455,6 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt
maxLocalSPressure, status);
maxLocalSPressure += RegForVCC;
-
}
if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit)
continue;
@@ -4499,7 +4463,9 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt
if (bBothOutLimit && maxLocalVPressure <= VLimit)
continue;
GCNRPTracker::LiveRegSet liveSet;
- hotBlocks.push_back({ &MBB, liveSet,std::make_pair(maxLocalVPressure, maxLocalSPressure), 0, 0 });
+ hotBlocks.push_back({&MBB, liveSet,
+ std::make_pair(maxLocalVPressure, maxLocalSPressure),
+ 0, 0});
}
// Collect vmemLdInput/OutputSize.
if (EnableVmemDegree) {
@@ -4541,8 +4507,8 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt
}
if (EnableUniformVectorToScalar) {
- if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, hotBlocks, LIS, MRI,
- SIRI, SIII, MLI)) {
+ if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap,
+ hotBlocks, LIS, MRI, SIRI, SIII, MLI)) {
// Rebuild LIS.
LIS->reanalyze(MF);
status = GetRematStatus(MF, MLI, LIS, MRI, ST);
@@ -4596,15 +4562,17 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt
PressureUnderLimitSet.insert(MBB);
} else {
if (MaxLocalVGPR < it.maxPressures.first)
- it.maxPressures = std::make_pair(MaxLocalVGPR, it.maxPressures.second);
+ it.maxPressures =
+ std::make_pair(MaxLocalVGPR, it.maxPressures.second);
if (MaxLocalSGPR < it.maxPressures.second)
it.maxPressures = std::make_pair(it.maxPressures.first, MaxLocalSGPR);
}
}
}
- bool bUpdated = perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates,
- ST, LIS, MLI, pDT, MRI, SIRI, SIII);
+ bool bUpdated =
+ perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, ST,
+ LIS, MLI, pDT, MRI, SIRI, SIII);
return bUpdated;
}
@@ -4613,8 +4581,10 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) {
if (MF.size() < 2)
return false;
LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
- MachineDominatorTree *DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
- MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+ MachineDominatorTree *DT =
+ &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ MachinePostDominatorTree *PDT =
+ &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
@@ -4629,8 +4599,8 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) {
}
}
- //LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)"));
- // For non-cs/ps, set target occ as 4.
+ // LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)"));
+ // For non-cs/ps, set target occ as 4.
bool bNearTarget = false;
bool bFinalUpdated = false;
bool bUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, bNearTarget);
@@ -4655,8 +4625,8 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
-INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, "AMDGPU rematerialize",
- false, false)
+INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
+ "AMDGPU rematerialize", false, false)
char AMDGPUHotBlockRematerialize::ID = 0;
char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
@@ -4664,4 +4634,3 @@ char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() {
return new AMDGPUHotBlockRematerialize();
}
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index 6f44fec082..5336fde4cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -1,22 +1,21 @@
///////////////////////////////////////////////////////////////////////////////
// //
-// AMDGPUMIRUtils.cpp //
-// Copyright (C) Microsoft Corporation. All rights reserved. //
-// This file is distributed under the University of Illinois Open Source //
-// License. See LICENSE.TXT for details. //
+// AMDGPUMIRUtils.cpp // Copyright (C) Microsoft Corporation. All rights
+// reserved. // This file is distributed under the University of
+// Illinois Open Source // License. See LICENSE.TXT for details. //
// //
// Util functions for llvm MIR Passes. //
// //
///////////////////////////////////////////////////////////////////////////////
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/SlotIndexes.h"
-//#include "dxc/DXIL/DxilMetadataHelper.h"
+// #include "dxc/DXIL/DxilMetadataHelper.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
@@ -26,9 +25,9 @@
#include "llvm/Support/Debug.h"
-#include "GCNRegPressure.h"
#include "AMDGPUMIRUtils.h"
#include "AMDGPUSubExpDag.h"
+#include "GCNRegPressure.h"
#include <unordered_set>
#define DEBUG_TYPE "xb-mir-util"
@@ -48,7 +47,7 @@ public:
phiInsts.insert(&I);
unsigned Reg = I.getOperand(0).getReg();
// Add incoming values.
- for (unsigned i=1;i<I.getNumOperands();i+=2) {
+ for (unsigned i = 1; i < I.getNumOperands(); i += 2) {
MachineOperand &MO = I.getOperand(i);
if (!MO.isReg())
continue;
@@ -66,7 +65,8 @@ public:
} /// Adds custom features for a visualization of the ScheduleDAG.
void addCustomGraphFeatures(llvm::GraphWriter<CFGWithPhi *> &) const {}
MachineFunction &F;
- DenseMap<const MachineBasicBlock *, DenseSet<MachineInstr *>> blockToPhiInstsMap;
+ DenseMap<const MachineBasicBlock *, DenseSet<MachineInstr *>>
+ blockToPhiInstsMap;
void dump();
};
@@ -110,7 +110,8 @@ template <> struct DOTGraphTraits<CFGWithPhi *> : public DefaultDOTGraphTraits {
return R;
}
- static std::string getNodeLabel(const MachineBasicBlock *BB, const CFGWithPhi *G) {
+ static std::string getNodeLabel(const MachineBasicBlock *BB,
+ const CFGWithPhi *G) {
enum { MaxColumns = 8000 };
std::string Str;
raw_string_ostream OS(Str);
@@ -347,7 +348,7 @@ void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) {
}
MachineBasicBlock *split(MachineInstr *Inst) {
-
+
// Create the fall-through block.
MachineBasicBlock *MBB = Inst->getParent();
MachineFunction *MF = MBB->getParent();
@@ -462,9 +463,8 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc,
.addImm(offset * LaneSize);
MachineInstr *OffsetAddMI = OffsetAdd.getInstr();
MachineBasicBlock::iterator InsertPoint =
- llvm::FindOrCreateInsertionPointForSccDef(
- MI.getParent(), MI, SIRI, SIII, &MRI
- );
+ llvm::FindOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI,
+ SIII, &MRI);
MI.getParent()->insert(InsertPoint, OffsetAddMI);
SIII->legalizeOperands(*OffsetAddMI);
OffsetOp->setReg(NewOffsetReg);
@@ -631,7 +631,7 @@ bool reach_blocks(MachineBasicBlock *BB, MachineDominatorTree *DT,
return bCross;
}
-}
+} // namespace llvm
namespace llvm {
void viewCFGWithPhi(llvm::MachineFunction &F) {
@@ -1520,12 +1520,12 @@ void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &os) {
}
} // namespace pressure
-}// namespace llvm
+} // namespace llvm
namespace {
class ContributionList {
public:
- ContributionList(MachineFunction &MF) : MF(MF){};
+ ContributionList(MachineFunction &MF) : MF(MF) {};
void build();
bool propagateContribution();
MachineFunction &MF;
@@ -1754,46 +1754,45 @@ void write_contribution_list(llvm::MachineFunction &MF, const char *Filename) {
}
} // namespace llvm
-static bool IsPhysReg(const MachineOperand &Op)
-{
- return Op.isReg() && Op.getReg().isPhysical();
+static bool IsPhysReg(const MachineOperand &Op) {
+ return Op.isReg() && Op.getReg().isPhysical();
}
// Sometimes a split bb uses physical registers defined in BB; we have to add
// them to live-in or the IR is malformed.
-void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, const MachineRegisterInfo *MRI)
-{
- // Initialize with current set of liveins. For new blocks this will be empty.
- SmallDenseSet<unsigned, 8> DefSet;
- for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins())
- {
- DefSet.insert(P.PhysReg);
- }
+void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB,
+ const MachineRegisterInfo *MRI) {
+ // Initialize with current set of liveins. For new blocks this will be empty.
+ SmallDenseSet<unsigned, 8> DefSet;
+ for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins()) {
+ DefSet.insert(P.PhysReg);
+ }
- for (auto &MI : *NewBB)
- {
- // Add all undefined physical registers to the live in set.
- for (MachineOperand &Use : MI.operands())
- {
- // Only process physreg uses.
- if (!IsPhysReg(Use) || !Use.isUse()) continue;
+ for (auto &MI : *NewBB) {
+ // Add all undefined physical registers to the live in set.
+ for (MachineOperand &Use : MI.operands()) {
+ // Only process physreg uses.
+ if (!IsPhysReg(Use) || !Use.isUse())
+ continue;
- // Reserved regs do not need to be tracked through live-in sets.
- unsigned Reg = Use.getReg();
- if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) continue;
+ // Reserved regs do not need to be tracked through live-in sets.
+ unsigned Reg = Use.getReg();
+ if (Use.isImplicit() && MRI && MRI->isReserved(Reg))
+ continue;
- if (!DefSet.count(Reg))
- NewBB->addLiveIn(Reg);
- }
+ if (!DefSet.count(Reg))
+ NewBB->addLiveIn(Reg);
+ }
- // Add all physical register defs (exlicit+implicit) to the def register set.
- for (MachineOperand &Def : MI.operands())
- {
- // Only process physreg defs.
- if (!IsPhysReg(Def) || !Def.isDef()) continue;
- DefSet.insert(Def.getReg());
- }
+      // Add all physical register defs (explicit+implicit) to the def register
+ // set.
+ for (MachineOperand &Def : MI.operands()) {
+ // Only process physreg defs.
+ if (!IsPhysReg(Def) || !Def.isDef())
+ continue;
+ DefSet.insert(Def.getReg());
}
+ }
}
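// Illustrative usage sketch (editor's example, not part of the patch): after
// splitting a block with the split() helper above, the new block's physreg
// live-ins can be recomputed from its instructions:
//   MachineBasicBlock *NewBB = split(&MI);
//   UpdatePhysRegLiveInForBlock(NewBB, &NewBB->getParent()->getRegInfo());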
void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB,
@@ -1829,50 +1828,41 @@ void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB,
}
}
-MachineReg llvm::CreateVirtualRegForOperand(
- MachineOpcode Opcode,
- unsigned OpNum,
- MachineFunction &MF
-)
-{
- const TargetSubtargetInfo &ST = MF.getSubtarget();
- const TargetRegisterInfo *TRI = ST.getRegisterInfo();
- const TargetInstrInfo *TII = ST.getInstrInfo();
- const MCInstrDesc &Desc = TII->get(Opcode);
- const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF);
- if (!RC)
- {
- llvm::report_fatal_error("Unable to create virtual reg for instruction operand");
- }
+MachineReg llvm::CreateVirtualRegForOperand(MachineOpcode Opcode,
+ unsigned OpNum,
+ MachineFunction &MF) {
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetInstrInfo *TII = ST.getInstrInfo();
+ const MCInstrDesc &Desc = TII->get(Opcode);
+ const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF);
+ if (!RC) {
+ llvm::report_fatal_error(
+ "Unable to create virtual reg for instruction operand");
+ }
- MachineRegisterInfo &MRI = MF.getRegInfo();
- return MRI.createVirtualRegister(RC);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ return MRI.createVirtualRegister(RC);
}
-MachineReg llvm::CreateVirtualDstReg(
- MachineOpcode Opcode,
- MachineFunction &MF
-)
-{
- return llvm::CreateVirtualRegForOperand(Opcode, 0, MF);
+MachineReg llvm::CreateVirtualDstReg(MachineOpcode Opcode,
+ MachineFunction &MF) {
+ return llvm::CreateVirtualRegForOperand(Opcode, 0, MF);
}
// Return true if the MI is a copy of exec.
// If true then sets pDst to the destination register.
-bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst)
-{
- enum {DST=0, SRC=1};
- bool FoundCopy = false;
- if (MI.getOpcode() == AMDGPU::COPY
- || MI.getOpcode() == AMDGPU::S_MOV_B32
- || MI.getOpcode() == AMDGPU::S_MOV_B64)
- {
- const MachineOperand &Src = MI.getOperand(SRC);
- if (Src.isReg() && Src.getReg() == Exec)
- {
- FoundCopy = true;
- }
+bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec,
+ MachineReg *pDst) {
+ enum { DST = 0, SRC = 1 };
+ bool FoundCopy = false;
+ if (MI.getOpcode() == AMDGPU::COPY || MI.getOpcode() == AMDGPU::S_MOV_B32 ||
+ MI.getOpcode() == AMDGPU::S_MOV_B64) {
+ const MachineOperand &Src = MI.getOperand(SRC);
+ if (Src.isReg() && Src.getReg() == Exec) {
+ FoundCopy = true;
}
+ }
#if 0 // TODO: Delete this.
else if (MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO ||
MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO_32)
@@ -1880,29 +1870,26 @@ bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst)
FoundCopy = true;
}
#endif
-
- if (FoundCopy)
- {
- *pDst = MI.getOperand(DST).getReg();
- }
- return FoundCopy;
+ if (FoundCopy) {
+ *pDst = MI.getOperand(DST).getReg();
+ }
+
+ return FoundCopy;
}
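// Illustrative usage sketch (editor's example, not part of the patch):
//   MachineReg SavedExec = AMDGPU::NoRegister;
//   if (IsExecCopy(MI, AMDGPU::EXEC, &SavedExec)) {
//     // SavedExec now holds the register the exec mask was copied into.
//   }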
-llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF)
-{
- llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, AMDGPU::NoSubRegister};
- if (MachineInstr* MI = GetWqmEntryActiveMaskInst(MF))
- {
- LiveLaneMask.Reg = MI->getOperand(0).getReg();
- LiveLaneMask.SubReg = MI->getOperand(0).getSubReg();
- }
+llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) {
+ llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister,
+ AMDGPU::NoSubRegister};
+ if (MachineInstr *MI = GetWqmEntryActiveMaskInst(MF)) {
+ LiveLaneMask.Reg = MI->getOperand(0).getReg();
+ LiveLaneMask.SubReg = MI->getOperand(0).getSubReg();
+ }
- return LiveLaneMask;
+ return LiveLaneMask;
}
-MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF)
-{
+MachineInstr *llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) {
#if 0 // TODO: Get rid of this
// Look forward in the entry block for the SET_LIVE_LANE_MASK instruction.
// This instruction is added by the SIWholeQuadMode pass.
@@ -1917,22 +1904,23 @@ MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF)
}
#endif
- return nullptr;
+ return nullptr;
}
-bool llvm::IsFetchShaderCall(const MachineInstr *MI)
-{
+bool llvm::IsFetchShaderCall(const MachineInstr *MI) {
#if 0 // TODO: Get rid of this.
return
MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER ||
MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::FetchShaderCall);
#else
- return false;
+ return false;
#endif
}
-bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) {
- const TargetRegisterInfo* TRI = MBB->getParent()->getRegInfo().getTargetRegisterInfo();
+bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB,
+ llvm::MachineBasicBlock::iterator MI) {
+ const TargetRegisterInfo *TRI =
+ MBB->getParent()->getRegInfo().getTargetRegisterInfo();
for (auto it = MI; it != MBB->end(); ++it) {
const MachineInstr &CurMI = *it;
// Hit use of scc, it is live.
@@ -1962,79 +1950,70 @@ bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::it
// as the new insert location.
//
MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef(
- MachineBasicBlock *MBB,
- MachineBasicBlock::iterator MI,
- const TargetRegisterInfo* TRI,
- const SIInstrInfo* TII,
- MachineRegisterInfo* MRI,
- SccDefInsertPointConstraintFlags Constraints
-)
-{
- // If SCC is dead at MI when we can use MI as the insert point.
- if (!llvm::IsSccLiveAt(MBB, MI))
- {
- return MI;
- }
+ MachineBasicBlock *MBB, MachineBasicBlock::iterator MI,
+ const TargetRegisterInfo *TRI, const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) {
+  // If SCC is dead at MI then we can use MI as the insert point.
+ if (!llvm::IsSccLiveAt(MBB, MI)) {
+ return MI;
+ }
- const bool CheckForExecWrite =
- Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
+ const bool CheckForExecWrite =
+ Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
- // Get the starting reverse iterator taking care to handle the MBB->end() case.
- MachineBasicBlock::reverse_iterator Start;
- if (MI == MBB->end())
- {
- Start = MBB->rbegin();
- }
- else
- {
- Start = MI.getReverse();
- }
-
- // Otherwise, walk backwards through the block looking for a location where
- // SCC is dead.
- for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); It != End; ++It)
- {
- // If the instruction modifies exec then we cannot use it as
- // an insertion point (if that is a constraint from the caller).
- // The check for EXEC works for both wave64 and wave32 because
- // it will also catch writes to the subregisters (e.g. exec_lo).
- if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI))
- {
- break;
- }
+ // Get the starting reverse iterator taking care to handle the MBB->end()
+ // case.
+ MachineBasicBlock::reverse_iterator Start;
+ if (MI == MBB->end()) {
+ Start = MBB->rbegin();
+ } else {
+ Start = MI.getReverse();
+ }
- if (It->modifiesRegister(AMDGPU::SCC, TRI)
- && !It->readsRegister(AMDGPU::SCC, TRI))
- {
- return It->getIterator();
- }
+ // Otherwise, walk backwards through the block looking for a location where
+ // SCC is dead.
+ for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend();
+ It != End; ++It) {
+ // If the instruction modifies exec then we cannot use it as
+ // an insertion point (if that is a constraint from the caller).
+ // The check for EXEC works for both wave64 and wave32 because
+ // it will also catch writes to the subregisters (e.g. exec_lo).
+ if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) {
+ break;
}
- // If no safe location can be found in the block we can save and restore
- // SCC around MI. There is no way to directly read or write SCC so we use
- // s_cselect to read the current value of SCC and s_cmp to write the saved
- // value back to SCC.
- //
- // The generated code will look like this;
- //
- // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC
- // <----- Newly created safe insert point.
- // MI
- // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC
- //
- unsigned int TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- DebugLoc DL = MI->getDebugLoc();
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc)
- .addImm(-1)
- .addImm(0);
- BuildMI(*MBB, std::next(MI->getIterator()), DL, TII->get(AMDGPU::S_CMP_LG_U32))
- .addReg(TmpScc, RegState::Kill)
- .addImm(0);
+ if (It->modifiesRegister(AMDGPU::SCC, TRI) &&
+ !It->readsRegister(AMDGPU::SCC, TRI)) {
+ return It->getIterator();
+ }
+ }
- return MI;
+ // If no safe location can be found in the block we can save and restore
+ // SCC around MI. There is no way to directly read or write SCC so we use
+ // s_cselect to read the current value of SCC and s_cmp to write the saved
+ // value back to SCC.
+ //
+  // The generated code will look like this:
+ //
+ // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC
+ // <----- Newly created safe insert point.
+ // MI
+ // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC
+ //
+ unsigned int TmpScc =
+ MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(*MBB, std::next(MI->getIterator()), DL,
+ TII->get(AMDGPU::S_CMP_LG_U32))
+ .addReg(TmpScc, RegState::Kill)
+ .addImm(0);
+
+ return MI;
}
-
namespace {
bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes,
SmallDenseSet<MachineBasicBlock *, 2> &touchedMBBSet) {
@@ -2099,9 +2078,7 @@ bool llvm::isLocalLiveInterval(
return isLocalLiveRange(&LI, Indexes, touchedMBBSet);
}
-
-bool llvm::isLocalLiveInterval(
- const LiveInterval &LI, SlotIndexes *Indexes) {
+bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
if (LI.hasSubRanges()) {
for (const auto &S : LI.subranges()) {
if (!isLocalLiveRange(&S, Indexes))
@@ -2117,8 +2094,8 @@ bool llvm::isLocalLiveInterval(
void llvm::buildEndLiveMap(
llvm::LiveIntervals *LIS, llvm::MachineFunction &MF,
const llvm::MachineRegisterInfo &MRI,
- llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet>
- &MBBLiveMap, bool After) {
+ llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet> &MBBLiveMap,
+ bool After) {
// When there is only one block, the end live reg set must be empty.
if (MF.size() == 1)
return;
@@ -2158,7 +2135,8 @@ void llvm::buildEndLiveMap(
}
}
-unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) {
+unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF,
+ const SIRegisterInfo *SIRI) {
auto &MRI = MF.getRegInfo();
for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
if (MRI.isPhysRegUsed(Reg)) {
@@ -2168,14 +2146,16 @@ unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterIn
return 0;
}
-unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) {
+unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF,
+ const SIRegisterInfo *SIRI) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned MaxSGPR = 0;
for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
if (MRI.isPhysRegUsed(Reg)) {
- // Skip scratch reserved reg, which is a big register that don't really contribute to this stat.
+      // Skip the scratch reserved reg, which is a big register that doesn't
+      // really contribute to this stat.
if (ScratchRSrcReg != 0) {
if (SIRI->isSubRegister(ScratchRSrcReg, Reg))
continue;
@@ -2187,8 +2167,7 @@ unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterIn
return 1 + llvm::RegForVCC + MaxSGPR;
}
-void llvm::dumpLiveSet(const LiveSet &LiveSet,
- const SIRegisterInfo *SIRI) {
+void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
dbgs() << "\n live set: \n";
for (auto it : LiveSet) {
@@ -2227,15 +2206,16 @@ bool llvm::IsLdsSpillSupportedForHwStage(xmd::HwStage Stage)
}
#endif
-MachineBasicBlock::succ_iterator llvm::FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ)
-{
- for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(), End = MBB->succ_end(); It != End; ++It)
- {
- if (*It == Succ)
- {
- return It;
- }
+MachineBasicBlock::succ_iterator
+llvm::FindSuccessor(llvm::MachineBasicBlock *MBB,
+ llvm::MachineBasicBlock *Succ) {
+ for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(),
+ End = MBB->succ_end();
+ It != End; ++It) {
+ if (*It == Succ) {
+ return It;
}
+ }
- return MBB->succ_end();
+ return MBB->succ_end();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
index 16b55c5c94..b077fad4c6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -2,9 +2,9 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/MC/LaneBitmask.h"
-#include "llvm/IR/CallingConv.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/LaneBitmask.h"
namespace llvm {
@@ -37,10 +37,10 @@ using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
const llvm::MachineRegisterInfo &MRI,
const llvm::SIRegisterInfo *SIRI);
-void CollectLiveSetPressure(
- const LiveSet &liveSet,
- const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI,
- unsigned &VPressure, unsigned &SPressure);
+void CollectLiveSetPressure(const LiveSet &liveSet,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI,
+ unsigned &VPressure, unsigned &SPressure);
bool isExecUpdateForControlFlow(llvm::MachineInstr &MI);
@@ -60,37 +60,34 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
const llvm::SIInstrInfo *TII,
llvm::SlotIndexes *SlotIndexes);
-bool reach_block(llvm::MachineBasicBlock *FromBB, llvm::MachineDominatorTree *DT,
+bool reach_block(llvm::MachineBasicBlock *FromBB,
+ llvm::MachineDominatorTree *DT,
llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
llvm::MachineBasicBlock *ToBB);
-
void viewCFGWithPhi(llvm::MachineFunction &MF);
void write_contribution_list(llvm::MachineFunction &MF, const char *Filename);
-llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, const llvm::SIInstrInfo *TII);
+llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF,
+ const llvm::SIInstrInfo *TII);
bool GetNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd,
llvm::MachineBasicBlock &MBB);
-void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, const llvm::MachineRegisterInfo *MRI);
+void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB,
+ const llvm::MachineRegisterInfo *MRI);
void BuildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB,
- llvm::SmallDenseSet<unsigned, 8> &LiveOutSet,
- const llvm::MachineRegisterInfo *MRI);
+ llvm::SmallDenseSet<unsigned, 8> &LiveOutSet,
+ const llvm::MachineRegisterInfo *MRI);
-MachineReg CreateVirtualRegForOperand(
- MachineOpcode Opcode,
- unsigned Operand,
- llvm::MachineFunction &MF
-);
+MachineReg CreateVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand,
+ llvm::MachineFunction &MF);
-MachineReg CreateVirtualDstReg(
- MachineOpcode Opcode,
- llvm::MachineFunction &MF
-);
+MachineReg CreateVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF);
-bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, MachineReg *pDst);
+bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec,
+ MachineReg *pDst);
struct MachineRegWithSubReg {
MachineReg Reg = AMDGPU::NoRegister;
unsigned SubReg = AMDGPU::NoSubRegister;
@@ -98,22 +95,22 @@ struct MachineRegWithSubReg {
MachineRegWithSubReg GetWqmEntryActiveMask(llvm::MachineFunction &MF);
llvm::MachineInstr *GetWqmEntryActiveMaskInst(llvm::MachineFunction &MF);
-// Return true if this machine instruction represents a call to the fetch shader.
-// We curently have two mechanisims for calling fetch shader:
+// Return true if this machine instruction represents a call to the fetch
+// shader. We currently have two mechanisms for calling the fetch shader:
// 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction
// 2. A CALL instruction with the `FetchShaderCall` flag set to true.
-bool IsFetchShaderCall(const llvm::MachineInstr* MI);
-
-bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI);
+bool IsFetchShaderCall(const llvm::MachineInstr *MI);
+bool IsSccLiveAt(llvm::MachineBasicBlock *MBB,
+ llvm::MachineBasicBlock::iterator MI);
// An enum used to pass additional constraints to
// `FindOrCreateInsertionPointForSccDef()`. This will further
// constrain the location where the scc def can be inserted.
-enum SccDefInsertPointConstraintFlags
-{
- None = 0, // No additional constraints.
- NoExecWrite = 1, // Should be no modification of exec between BeforeInst and insert point.
+enum SccDefInsertPointConstraintFlags {
+ None = 0, // No additional constraints.
+ NoExecWrite = 1, // Should be no modification of exec between BeforeInst and
+ // insert point.
};
// Look for a safe place to insert an instruction that defines scc.
@@ -130,55 +127,53 @@ enum SccDefInsertPointConstraintFlags
// as the new insert location.
//
llvm::MachineBasicBlock::iterator FindOrCreateInsertionPointForSccDef(
- llvm::MachineBasicBlock* MBB,
- llvm::MachineBasicBlock::iterator BeforeInst,
- const llvm::TargetRegisterInfo* TRI,
- const llvm::SIInstrInfo* TII,
- llvm::MachineRegisterInfo* MRI,
- SccDefInsertPointConstraintFlags Constraints = SccDefInsertPointConstraintFlags::None
-);
+ llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst,
+ const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII,
+ llvm::MachineRegisterInfo *MRI,
+ SccDefInsertPointConstraintFlags Constraints =
+ SccDefInsertPointConstraintFlags::None);
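// Illustrative usage sketch (editor's example, not part of the patch):
// inserting an SCC-defining instruction at a point where SCC is dead, while
// also forbidding exec writes between BeforeInst and the insert point.
//   MachineBasicBlock::iterator InsertPt = FindOrCreateInsertionPointForSccDef(
//       MBB, BeforeInst, TRI, TII, &MRI,
//       SccDefInsertPointConstraintFlags::NoExecWrite);
//   BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::S_CMP_LG_U32))
//       .addReg(Reg) // hypothetical register operand
//       .addImm(0);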
// Check if LI lives across basic blocks; save all touched basic blocks if it
// is local.
bool isLocalLiveInterval(
const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes,
llvm::SmallDenseSet<llvm::MachineBasicBlock *, 2> &touchedMBBSet);
-bool isLocalLiveInterval(
- const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes);
+bool isLocalLiveInterval(const llvm::LiveInterval &LI,
+ llvm::SlotIndexes *Indexes);
// build liveRegSet at end of each MBB.
void buildEndLiveMap(
llvm::LiveIntervals *LIS, llvm::MachineFunction &MF,
const llvm::MachineRegisterInfo &MRI,
- llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet>
- &MBBLiveMap, bool After);
+ llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet> &MBBLiveMap, bool After);
-void dumpLiveSet(const LiveSet &LiveSet,
- const llvm::SIRegisterInfo *SIRI);
+void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI);
-unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI);
-unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI);
+unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF,
+ const llvm::SIRegisterInfo *SIRI);
+unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF,
+ const llvm::SIRegisterInfo *SIRI);
bool isFastMathInst(llvm::MachineInstr &MI);
namespace pressure {
void print_reg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI,
- const llvm::SIRegisterInfo *SIRI,
- llvm::raw_ostream &os);
+ const llvm::SIRegisterInfo *SIRI, llvm::raw_ostream &os);
void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS,
const char *Filename);
void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS,
llvm::raw_ostream &os);
-}
+} // namespace pressure
// bool IsLdsSpillSupportedForHwStage(xmd::HwStage Stage);
// Look for the successor `Succ` of the given `MBB`.
// Returns MBB->succ_end() if `Succ` is not a successor of MBB.
-llvm::MachineBasicBlock::succ_iterator FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ);
+llvm::MachineBasicBlock::succ_iterator
+FindSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ);
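// Illustrative usage sketch (editor's example, not part of the patch):
//   MachineBasicBlock::succ_iterator It = FindSuccessor(MBB, Succ);
//   if (It != MBB->succ_end())
//     MBB->removeSuccessor(It); // e.g. detach Succ when rewriting the CFG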
// The enum and helper function for v_perm selection mask.
//
-// The input byte layout of v_perm is as below:
+// The input byte layout of v_perm is as below:
//
// BYTE in[8]
// in[0] = $src1_BYTE0;
@@ -211,7 +206,7 @@ constexpr int buildVPermSelectMask(V_PERM_IN_BYTE_POS Sel_0,
V_PERM_IN_BYTE_POS Sel_1,
V_PERM_IN_BYTE_POS Sel_2,
V_PERM_IN_BYTE_POS Sel_3) {
- return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) |
- ((int)Sel_1 << 8) | (int)Sel_0);
-}
+ return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) | ((int)Sel_1 << 8) |
+ (int)Sel_0);
}
+} // namespace llvm
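// Illustrative example (editor's sketch, not part of the patch): assuming the
// V_PERM_IN_BYTE_POS enumerators number the input bytes 0..7 per the layout
// above (in[0..3] = $src1, in[4..7] = $src0), a selector that swaps the two
// 16-bit halves of $src1 would be
//   constexpr int Mask = buildVPermSelectMask(V_PERM_IN_BYTE_POS(2),  // out[0]
//                                             V_PERM_IN_BYTE_POS(3),  // out[1]
//                                             V_PERM_IN_BYTE_POS(0),  // out[2]
//                                             V_PERM_IN_BYTE_POS(1)); // out[3]
// i.e. Mask == 0x01000302, with Sel_0 packed into the lowest byte.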
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
index ceb22b5ff9..21aa5db0c6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
@@ -69,7 +69,8 @@
// ...
//
// label:
-// v3 = phi v0, v1 ; divergent! because of divergent branch.
+// v3 = phi v0, v1 ; divergent! because of divergent
+// branch.
//
// The boolean value is bit-divergent. When passed to the branch as an operand,
// the branch becomes divergent, whose sync dependency will be computed as
@@ -81,13 +82,14 @@
// control flow.
// For case like
// %163:sreg_64_xexec = S_MOV_B64 $exec
-//bb.1:
+// bb.1:
//; predecessors: %bb.1, %bb.0
-// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
-// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
+// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%),
+// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec
// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec
-// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec,
+// implicit-def $scc, implicit $exec
//...
// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc
// S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -164,20 +166,20 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
#include "AMDGPUMirDivergenceAnalysis.h"
-#include "GCNSubtarget.h"
+#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "TargetInfo/AMDGPUTargetInfo.h"
-#include "SIInstrInfo.h"
-//#include "llvm/Analysis/Passes.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
+// #include "llvm/Analysis/Passes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Support/Debug.h"
-//#include "newbe/cli/newbe_opts.h" // AMDGPU change.
+// #include "newbe/cli/newbe_opts.h" // AMDGPU change.
#include "llvm/Support/raw_ostream.h"
#include <vector>
@@ -1223,24 +1225,24 @@ bool llvm::isAMDGPUOpcodeDivergent(class MachineInstr *MI) {
case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_nsa_gfx10:
case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_si:
case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_vi:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_si:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_vi:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_gfx10:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_nsa_gfx10:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_si:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_vi:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_gfx10:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_nsa_gfx10:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_si:
@@ -1555,8 +1557,8 @@ bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI,
if (MO.isUse())
continue;
unsigned Reg = MO.getReg();
- if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
- Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO)
+ if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::VCC ||
+ Reg == AMDGPU::VCC_LO)
return true;
// Check if the written register class overlaps the bool register class.
@@ -1567,15 +1569,15 @@ bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI,
//
// The underlying problem is that we have two notions of divergence
// (bit divergence and wave divergence) but the algorithm only propagates
- // wave divergence. The bit divergence is important for bools because it determines
- // if a branch is uniform or not (and thus catches cases where a uniform value is
- // used outside of a divergent control flow region). For bool values the
- // algorithm will treat normally uniform values (i.e. scalar registers) as divergent
- // in order to try and propagate bit divergence.
+ // wave divergence. The bit divergence is important for bools because it
+ // determines if a branch is uniform or not (and thus catches cases where a
+ // uniform value is used outside of a divergent control flow region). For
+ // bool values the algorithm will treat normally uniform values (i.e. scalar
+ // registers) as divergent in order to try and propagate bit divergence.
//
- // To fix all the possible bugs here I think we need to actually proagate bit
- // divergence as well as wave divergences. That is a bigger fix and this check should
- // cover most cases of treating a bool value as divergent.
+  // To fix all the possible bugs here I think we need to actually propagate
+  // bit divergence as well as wave divergence. That is a bigger fix and this
+ // check should cover most cases of treating a bool value as divergent.
const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
if (SIRI->getCommonSubClass(BoolRC, RC))
return true;
@@ -1597,13 +1599,13 @@ bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII,
!MI->isTerminator())
return true;
break;
- //case AMDGPU::AMDGPU_MAKE_UNIFORM:
- //case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST:
+ // case AMDGPU::AMDGPU_MAKE_UNIFORM:
+ // case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST:
case AMDGPU::V_READFIRSTLANE_B32:
case AMDGPU::V_READLANE_B32:
- //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32:
- //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64:
- // bool readfirstlane should be 1 bit, which means bit uniform.
+ // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32:
+ // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64:
+ // bool readfirstlane should be 1 bit, which means bit uniform.
return true;
case AMDGPU::S_OR_B32:
case AMDGPU::S_OR_B64: {
@@ -1638,7 +1640,8 @@ bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII,
}
bool isPhysicalReg(MachineRegisterInfo &MRI, Register reg) {
- return reg.isPhysical();;
+ return reg.isPhysical();
+ ;
}
bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) {
@@ -1646,13 +1649,14 @@ bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) {
}
// For an input reg of the MF, a VGPR will be divergent.
-bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) {
+bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) {
if (isPhysicalReg(MRI, Reg)) {
unsigned vir_reg = MRI.getLiveInVirtReg(Reg);
if (SIRI->isVGPR(MRI, vir_reg))
return true;
} else {
- if (SIRI->isVGPR(MRI, Reg))
+ if (SIRI->isVGPR(MRI, Reg))
return true;
}
return false;
@@ -1660,8 +1664,8 @@ bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, const SIRegiste
bool isSourceOfDivergence(MachineInstr *MI, MachineRegisterInfo &MRI,
const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
- //if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent))
- // return true;
+ // if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent))
+ // return true;
if (isAMDGPUOpcodeDivergent(MI))
return true;
@@ -1715,8 +1719,7 @@ bool isWriteExec(const MachineInstr *MI) {
if (MO.isUse())
continue;
unsigned Reg = MO.getReg();
- if (Reg == AMDGPU::EXEC ||
- Reg == AMDGPU::EXEC_LO)
+ if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
return true;
}
return false;
@@ -1735,7 +1738,6 @@ bool isVCndMask(unsigned Opcode) {
}
}
-
bool isExecRegionOp(unsigned Op) {
switch (Op) {
default:
@@ -1812,17 +1814,18 @@ bool isInsideExecRegion(const MachineBasicBlock &MBB,
return PDT.dominates(RegionEndMBB, &MBB);
}
-// Map from BB to nearest Exec Region. How to build? Add every MBB unless already has smaller region?
-// Then when hit saveExec, propagate leaked users of define inside the exec region.
+// Map from BB to nearest Exec Region. How to build? Add every MBB unless
+// already has smaller region? Then when hit saveExec, propagate leaked users of
+// define inside the exec region.
} // namespace
namespace llvm {
// class DivergenceAnalysis
DivergenceAnalysis::DivergenceAnalysis(
- const MachineFunction &F, const MachineLoop *RegionLoop, const MachineDominatorTree &DT,
- const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI,
- SyncDependenceAnalysis &SDA, bool IsLCSSAForm,
+ const MachineFunction &F, const MachineLoop *RegionLoop,
+ const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm,
// AMDGPU change begin.
DivergentJoinMapTy &JoinMap
// AMDGPU change end.
@@ -1841,7 +1844,7 @@ void DivergenceAnalysis::markDivergent(const ValueTy DivVal) {
LLVM_DEBUG(const GCNSubtarget *ST = &F.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *SIRI = ST->getRegisterInfo();
dbgs() << "\t MarkDivergent :"; printReg(DivVal, SIRI););
- //AMDGPU change end.
+ // AMDGPU change end.
DivergentValues.insert(DivVal);
}
@@ -1948,7 +1951,7 @@ bool DivergenceAnalysis::updateTerminator(const MachineInstr &Term) const {
// Check bit uniform here if not divergent.
return !isBitUniform(Term, Processed);
}
- //case AMDGPU::AMDGPU_CALL_INDIRECT:
+ // case AMDGPU::AMDGPU_CALL_INDIRECT:
case AMDGPU::SI_CALL:
return true;
}
@@ -1965,13 +1968,10 @@ bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const {
continue;
Register Reg = Op.getReg();
if (Reg.isPhysical()) {
- if (Reg == AMDGPU::EXEC ||
- Reg == AMDGPU::EXEC_LO ||
- Reg == AMDGPU::SCC)
+ if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::SCC)
continue;
- else
- if (const MachineInstr *DefMI =
- findPhysicalDefineInSameMBB(Op.getParent(), Reg)) {
+ else if (const MachineInstr *DefMI =
+ findPhysicalDefineInSameMBB(Op.getParent(), Reg)) {
if (isDivergent(*DefMI))
return true;
} else {
@@ -1986,15 +1986,17 @@ bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const {
return false;
}
-bool DivergenceAnalysis::isTemporalDivergent(const MachineBasicBlock &ObservingBlock,
- const ValueTy Val,
- const MachineBasicBlock &IncomingBlock) const { // AMDGPU change
- const MachineBasicBlock *DefBlock = &IncomingBlock; // AMDGPU change: Take def point as incoming block for constants.
+bool DivergenceAnalysis::isTemporalDivergent(
+ const MachineBasicBlock &ObservingBlock, const ValueTy Val,
+ const MachineBasicBlock &IncomingBlock) const { // AMDGPU change
+ const MachineBasicBlock *DefBlock =
+ &IncomingBlock; // AMDGPU change: Take def point as incoming block for
+ // constants.
const auto *Inst = MRI.getUniqueVRegDef(Val);
if (Inst == nullptr)
return true;
if (Inst)
- DefBlock = Inst->getParent();
+ DefBlock = Inst->getParent();
// check whether any divergent loop carrying Val terminates before control
// proceeds to ObservingBlock
@@ -2020,13 +2022,14 @@ static bool HasIncomingUndefValue(const PHINode_ *Phi) {
// For case like
// %163:sreg_64_xexec = S_MOV_B64 $exec
-//bb.1:
+// bb.1:
//; predecessors: %bb.1, %bb.0
-// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
-// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
+// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%),
+// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec
// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec
-// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec,
+// implicit-def $scc, implicit $exec
//...
// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc
// S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -2091,8 +2094,8 @@ findSaveExec(const MachineInstr *MI,
// It will only run on divergent branch, so (A, B) is not in
// DivergentDisjointMap when A is uniform.
static bool isJoinDivergentOnlyOnSameIncomingValue(
- const PHINode_ &Phi, const DivergenceAnalysis *pDA, const MachineDominatorTree &DT,
- DivergentJoinMapTy &DivergentJoinMap) {
+ const PHINode_ &Phi, const DivergenceAnalysis *pDA,
+ const MachineDominatorTree &DT, DivergentJoinMapTy &DivergentJoinMap) {
// for phi which join divergent, if the incoming values from divergent
// branch are the same, the phi is still uniform.
// A
@@ -2183,14 +2186,14 @@ bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const {
// joining divergent disjoint path in Phi parent block
if (isJoinDivergent(*Phi.getParent())) {
// AMDGPU CHANGE BEGIN
- if (true/*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) {
+ if (true /*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) {
// Continue if the divergent join only on same incoming value.
if (!isJoinDivergentOnlyOnSameIncomingValue(Phi, this, DT,
DivergentJoinMap))
return true;
} else
- // AMDGPU CHANGE END
- return true;
+ // AMDGPU CHANGE END
+ return true;
}
// An incoming value could be divergent by itself.
@@ -2213,7 +2216,6 @@ bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const {
if (isDivergent(Reg) ||
isTemporalDivergent(*Phi.getParent(), Reg, *BB.getMBB()))
return true;
-
}
return false;
@@ -2259,7 +2261,8 @@ bool DivergenceAnalysis::inRegion(const MachineBasicBlock &BB) const {
// marks all users of loop-carried values of the loop headed by LoopHeader as
// divergent
-void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader) {
+void DivergenceAnalysis::taintLoopLiveOuts(
+ const MachineBasicBlock &LoopHeader) {
auto *DivLoop = LI.getLoopFor(&LoopHeader);
assert(DivLoop && "loopHeader is not actually part of a loop");
@@ -2324,7 +2327,7 @@ void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader)
}
}
-void DivergenceAnalysis::pushInstruction(const MachineInstr &I) {
+void DivergenceAnalysis::pushInstruction(const MachineInstr &I) {
Worklist.push_back(&I);
}
void DivergenceAnalysis::pushPHINodes(const MachineBasicBlock &Block) {
@@ -2355,8 +2358,8 @@ void DivergenceAnalysis::pushUsers(const MachineInstr &I) {
}
}
-bool DivergenceAnalysis::propagateJoinDivergence(const MachineBasicBlock &JoinBlock,
- const MachineLoop *BranchLoop) {
+bool DivergenceAnalysis::propagateJoinDivergence(
+ const MachineBasicBlock &JoinBlock, const MachineLoop *BranchLoop) {
LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");
// ignore divergence outside the region
@@ -2403,8 +2406,10 @@ void DivergenceAnalysis::propagateBranchDivergence(const MachineInstr &Term) {
}
}
-void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop) {
- LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() << "\n");
+void DivergenceAnalysis::propagateLoopDivergence(
+ const MachineLoop &ExitingLoop) {
+ LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber()
+ << "\n");
// don't propagate beyond region
if (!inRegion(*ExitingLoop.getHeader()))
@@ -2444,20 +2449,21 @@ void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop)
// For case like
// %149:sreg_64_xexec = S_MOV_B64 $exec
//
-//bb.3:
+// bb.3:
//; predecessors: %bb.3, %bb.2
-// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), %bb.4(50.00%)
+// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%),
+// %bb.4(50.00%)
//
// %148:vreg_512 = PHI %56:vreg_512, %bb.2, %55:vreg_512, %bb.3
// %153:sgpr_32 = V_READFIRSTLANE_B32 %36:vgpr_32, implicit $exec
// %154:sreg_64 = V_CMP_EQ_U32_e64 %153:sgpr_32, %36:vgpr_32, implicit $exec
-// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
-// $m0 = S_MOV_B32 %153:sgpr_32
-// %55:vreg_512 = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit $exec
-// $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc
+// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec,
+// implicit-def $scc, implicit $exec $m0 = S_MOV_B32 %153:sgpr_32 %55:vreg_512
+// = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit
+// $exec $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc
// S_CBRANCH_EXECNZ %bb.3, implicit $exec
//
-//bb.4:
+// bb.4:
//; predecessors: %bb.3
// successors: %bb.5(0x80000000); %bb.5(100.00%)
//
@@ -2596,7 +2602,7 @@ void DivergenceAnalysis::compute() {
// propagate divergence
while (!Worklist.empty()) {
- const MachineInstr *I= Worklist.back();
+ const MachineInstr *I = Worklist.back();
Worklist.pop_back();
// maintain uniformity of overrides
@@ -2715,23 +2721,23 @@ bool DivergenceAnalysis::isDivergent(const MachineInstr &I) const {
void DivergenceAnalysis::print(raw_ostream &OS, const Module_ *) const {
// iterate instructions using instructions() to ensure a deterministic order.
for (auto &MBB : F)
- for (auto &I : MBB) {
- if (isDivergent(I))
- OS << "DIVERGENT:" << I ;
- // AMDGPU changes begin
- else
- OS << "UNIFORM:" << I ;
- // AMDGPU changes end
- }
+ for (auto &I : MBB) {
+ if (isDivergent(I))
+ OS << "DIVERGENT:" << I;
+ // AMDGPU changes begin
+ else
+ OS << "UNIFORM:" << I;
+ // AMDGPU changes end
+ }
}
// class GPUDivergenceAnalysis
-MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis(MachineFunction &F,
- const MachineDominatorTree &DT,
- const MachinePostDominatorTree &PDT,
- const MachineLoopInfo &LI)
- : SDA(DT, PDT, LI, /*AMDGPU change*/DivergentJoinMap),
- DA(F, nullptr, DT, PDT, LI, SDA, false, /*AMDGPU change*/DivergentJoinMap) {
+MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis(
+ MachineFunction &F, const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI)
+ : SDA(DT, PDT, LI, /*AMDGPU change*/ DivergentJoinMap),
+ DA(F, nullptr, DT, PDT, LI, SDA, false,
+ /*AMDGPU change*/ DivergentJoinMap) {
MachineRegisterInfo &MRI = F.getRegInfo();
const GCNSubtarget *ST = &F.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *SIRI = ST->getRegisterInfo();
@@ -2758,10 +2764,11 @@ bool MirGPUDivergenceAnalysis::isDivergent(const MachineInstr *I) const {
return DA.isDivergent(*I);
}
-void MirGPUDivergenceAnalysis::print(raw_ostream &OS, const Module_ *mod) const {
+void MirGPUDivergenceAnalysis::print(raw_ostream &OS,
+ const Module_ *mod) const {
OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
DA.print(OS, mod);
OS << "}\n";
}
-}
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
index edcf96ec44..d9fd4044c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
@@ -1,4 +1,5 @@
-//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++ -*-===//
+//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++
+//-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -14,11 +15,11 @@
#pragma once
-#include "llvm/ADT/DenseSet.h"
+#include "AMDGPUMirSyncDependenceAnalysis.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "AMDGPUMirSyncDependenceAnalysis.h"
#include "llvm/Pass.h"
#include <vector>
@@ -50,8 +51,10 @@ public:
/// Otherwise the whole function is analyzed.
/// \param IsLCSSAForm whether the analysis may assume that the IR in the
/// region is in LCSSA form.
- DivergenceAnalysis(const llvm::MachineFunction &F, const MachineLoop *RegionLoop,
- const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+ DivergenceAnalysis(const llvm::MachineFunction &F,
+ const MachineLoop *RegionLoop,
+ const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT,
const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA,
bool IsLCSSAForm,
// AMDGPU change begin.
@@ -98,10 +101,12 @@ private:
bool updateTerminator(const MachineInstr &Term) const;
bool updatePHINode(const PHINode_ &Phi) const;
bool updateVCndMask(const MachineInstr &VCndMask) const;
- bool isBitUniform(const MachineInstr &I,
- llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
- bool isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO,
- llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
+ bool
+ isBitUniform(const MachineInstr &I,
+ llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
+ bool
+ isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO,
+ llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
/// \brief Computes whether \p Inst is divergent based on the
/// divergence of its operands.
@@ -136,9 +141,9 @@ private:
}
/// \brief Whether \p Val is divergent when read in \p ObservingBlock.
- bool isTemporalDivergent(const MachineBasicBlock &ObservingBlock,
- const ValueTy Val,
- const MachineBasicBlock &incomingBlock) const; // AMDGPU change
+ bool isTemporalDivergent(
+ const MachineBasicBlock &ObservingBlock, const ValueTy Val,
+ const MachineBasicBlock &incomingBlock) const; // AMDGPU change
/// \brief Whether \p Block is join divergent
///
@@ -207,14 +212,14 @@ private:
// Set of known-uniform values.
llvm::DenseSet<unsigned> UniformOverrides;
- llvm::DenseSet<const llvm::MachineInstr*> UniformOverridesInsts;
+ llvm::DenseSet<const llvm::MachineInstr *> UniformOverridesInsts;
// Blocks with joining divergent control from different predecessors.
llvm::DenseSet<const MachineBasicBlock *> DivergentJoinBlocks;
// Detected/marked divergent values.
llvm::DenseSet<unsigned> DivergentValues;
- llvm::DenseSet<const llvm::MachineInstr*> DivergentInsts;
+ llvm::DenseSet<const llvm::MachineInstr *> DivergentInsts;
// Mir change for EXEC control flow.
// Map from MBB to the exec region it belongs to.
@@ -226,16 +231,15 @@ private:
struct ExecRegion {
const llvm::MachineInstr *begin;
const llvm::MachineInstr *end;
- std::vector<const llvm::MachineBasicBlock*> blocks;
+ std::vector<const llvm::MachineBasicBlock *> blocks;
bool bPropagated = false;
- ExecRegion(const llvm::MachineInstr *b,
- const llvm::MachineInstr *e)
+ ExecRegion(const llvm::MachineInstr *b, const llvm::MachineInstr *e)
: begin(b), end(e), bPropagated(false) {}
};
llvm::DenseMap<const llvm::MachineBasicBlock *, ExecRegion *> ExecRegionMap;
// Internal worklist for divergence propagation.
- std::vector<const llvm::MachineInstr*> Worklist;
+ std::vector<const llvm::MachineInstr *> Worklist;
};
/// \brief Divergence analysis frontend for GPU kernels.
@@ -251,15 +255,17 @@ class MirGPUDivergenceAnalysis {
// When A is divergent branch, B and C are divergent join at D.
// Then DivergentJoinMap[B].count(C) > 0 and
// DivergentJoinMap[C].count(B) > 0.
- DivergentJoinMapTy DivergentJoinMap;
+ DivergentJoinMapTy DivergentJoinMap;
// AMDGPU change end
SyncDependenceAnalysis SDA;
DivergenceAnalysis DA;
public:
/// Runs the divergence analysis on @F, a GPU kernel
- MirGPUDivergenceAnalysis(llvm::MachineFunction &F, const MachineDominatorTree &DT,
- const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI);
+ MirGPUDivergenceAnalysis(llvm::MachineFunction &F,
+ const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI);
/// Whether any divergence was detected.
bool hasDivergence() const { return DA.hasDetectedDivergence(); }
@@ -278,4 +284,3 @@ public:
};
} // namespace llvm
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
index 7213f7b4b1..302939c76a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
@@ -1,4 +1,5 @@
-//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence Calculation
+//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence
+//Calculation
//--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -101,15 +102,15 @@
// loop exit and the loop header (_after_ SSA construction).
//
//===----------------------------------------------------------------------===//
+#include "AMDGPUMirSyncDependenceAnalysis.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "AMDGPUMirSyncDependenceAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include <stack>
#include <unordered_set>
@@ -120,19 +121,18 @@ namespace llvm {
ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet;
-SyncDependenceAnalysis::SyncDependenceAnalysis(const MachineDominatorTree &DT,
- const MachinePostDominatorTree &PDT,
- const MachineLoopInfo &LI,
- // AMDGPU change begin.
- DivergentJoinMapTy &JoinMap
- // AMDGPU change end.
+SyncDependenceAnalysis::SyncDependenceAnalysis(
+ const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI,
+ // AMDGPU change begin.
+ DivergentJoinMapTy &JoinMap
+ // AMDGPU change end.
)
: FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI),
- // AMDGPU change begin.
+ // AMDGPU change begin.
DivergentJoinMap(JoinMap)
- // AMDGPU change end.
-{
-}
+// AMDGPU change end.
+{}
SyncDependenceAnalysis::~SyncDependenceAnalysis() {}
@@ -155,19 +155,23 @@ struct DivergencePropagator {
// if DefMap[B] ~ undef then we haven't seen B yet
// if DefMap[B] == B then B is a join point of disjoint paths from X or B is
// an immediate successor of X (initial value).
- using DefiningBlockMap = std::map<const MachineBasicBlock *, const MachineBasicBlock *>;
+ using DefiningBlockMap =
+ std::map<const MachineBasicBlock *, const MachineBasicBlock *>;
DefiningBlockMap DefMap;
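// Illustrative walk-through (editor's example, not part of the patch): for a
// diamond X -> {B, C}, B -> D, C -> D with a divergent branch in X, the
// propagator seeds DefMap[B] = B and DefMap[C] = C. Visiting D from B records
// DefMap[D] = B; visiting D from C then sees a conflicting definition, so
// DefMap[D] is set to D itself, marking D as a join point of disjoint paths.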
// all blocks with pending visits
std::unordered_set<const MachineBasicBlock *> PendingUpdates;
- DivergencePropagator(const FunctionRPOT &FuncRPOT, const MachineDominatorTree &DT,
- const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI)
+ DivergencePropagator(const FunctionRPOT &FuncRPOT,
+ const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI)
: FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI),
JoinBlocks(new ConstBlockSet) {}
// set the definition at @block and mark @block as pending for a visit
- void addPending(const MachineBasicBlock &Block, const MachineBasicBlock &DefBlock) {
+ void addPending(const MachineBasicBlock &Block,
+ const MachineBasicBlock &DefBlock) {
bool WasAdded = DefMap.emplace(&Block, &DefBlock).second;
if (WasAdded)
PendingUpdates.insert(&Block);
@@ -190,7 +194,8 @@ struct DivergencePropagator {
// process @succBlock with reaching definition @defBlock
// the original divergent branch was in @parentLoop (if any)
- void visitSuccessor(const MachineBasicBlock &SuccBlock, const MachineLoop *ParentLoop,
+ void visitSuccessor(const MachineBasicBlock &SuccBlock,
+ const MachineLoop *ParentLoop,
const MachineBasicBlock &DefBlock) {
// @succBlock is a loop exit
@@ -223,14 +228,14 @@ struct DivergencePropagator {
// divergent exits.
// @rootBlock is either the block containing the branch or the header of the
// divergent loop.
- // @nodeSuccessors is the set of successors of the node (MachineLoop or Terminator)
- // headed by @rootBlock.
- // @parentLoop is the parent loop of the MachineLoop or the loop that contains the
- // Terminator.
+ // @nodeSuccessors is the set of successors of the node (MachineLoop or
+ // Terminator) headed by @rootBlock.
+ // @parentLoop is the parent loop of the MachineLoop or the loop that contains
+ // the Terminator.
template <typename SuccessorIterable>
- std::unique_ptr<ConstBlockSet>
- computeJoinPoints(const MachineBasicBlock &RootBlock,
- SuccessorIterable NodeSuccessors, const MachineLoop *ParentLoop, const MachineBasicBlock * PdBoundBlock) {
+ std::unique_ptr<ConstBlockSet> computeJoinPoints(
+ const MachineBasicBlock &RootBlock, SuccessorIterable NodeSuccessors,
+ const MachineLoop *ParentLoop, const MachineBasicBlock *PdBoundBlock) {
assert(JoinBlocks);
// bootstrap with branch targets
@@ -250,7 +255,8 @@ struct DivergencePropagator {
auto ItBeginRPO = FuncRPOT.begin();
// skip until term (TODO RPOT won't let us start at @term directly)
- for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {}
+ for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {
+ }
auto ItEndRPO = FuncRPOT.end();
assert(ItBeginRPO != ItEndRPO);
@@ -337,30 +343,26 @@ struct DivergencePropagator {
// | B C
// | | / |
// +--L P
- //
+ //
// In this cfg, C is the RootBlock and P is C's post-dominator.
// It will only visit L and P and then stop because it hits the
// post dominator. Most loops do not hit this case because the
// loop exiting block (C) will branch directly back to the loop
// header.
- //
- if (HeaderDefBlock)
- {
- for (const auto *ExitBlock : ReachedLoopExits) {
- auto ItExitDef = DefMap.find(ExitBlock);
- assert((ItExitDef != DefMap.end()) &&
- "no reaching def at reachable loop exit");
- if (ItExitDef->second != HeaderDefBlock) {
- JoinBlocks->insert(ExitBlock);
- }
- }
- }
- else
- {
- for (const auto *ExitBlock : ReachedLoopExits)
- {
- JoinBlocks->insert(ExitBlock);
+ //
+ if (HeaderDefBlock) {
+ for (const auto *ExitBlock : ReachedLoopExits) {
+ auto ItExitDef = DefMap.find(ExitBlock);
+ assert((ItExitDef != DefMap.end()) &&
+ "no reaching def at reachable loop exit");
+ if (ItExitDef->second != HeaderDefBlock) {
+ JoinBlocks->insert(ExitBlock);
}
+ }
+ } else {
+ for (const auto *ExitBlock : ReachedLoopExits) {
+ JoinBlocks->insert(ExitBlock);
+ }
}
}
@@ -370,12 +372,14 @@ struct DivergencePropagator {
// AMDGPU change begin.
// For all join blocks caused by a divergent RootBlock, the preds of a join block
-// which are in DefMap or the RootBlock are divergent join each other on the join block because
-// of divergent RootBlock.
-static void updateJoinMap(
- const MachineBasicBlock *RootBlock,
- DenseMap<const MachineBasicBlock *, SmallPtrSet<const MachineBasicBlock *, 4>> &JoinMap,
- DivergencePropagator::DefiningBlockMap &DefMap, ConstBlockSet &JoinBlocks) {
+// which are in DefMap, or the RootBlock itself, divergently join each other on
+// the join block because of the divergent RootBlock.
+static void
+updateJoinMap(const MachineBasicBlock *RootBlock,
+ DenseMap<const MachineBasicBlock *,
+ SmallPtrSet<const MachineBasicBlock *, 4>> &JoinMap,
+ DivergencePropagator::DefiningBlockMap &DefMap,
+ ConstBlockSet &JoinBlocks) {
for (const MachineBasicBlock *JoinBB : JoinBlocks) {
// Mark divergent join for all pred pairs which are in DefMap.
for (auto predIt = JoinBB->pred_begin(); predIt != JoinBB->pred_end();
@@ -400,7 +404,8 @@ static void updateJoinMap(
}
// AMDGPU change end.
-const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) {
+const ConstBlockSet &
+SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) {
using LoopExitVec = SmallVector<MachineBasicBlock *, 4>;
LoopExitVec LoopExits;
MachineLoop.getExitBlocks(LoopExits);
@@ -415,7 +420,8 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &Mach
}
// don't propagate beyond the immediate post dominator of the loop
- const auto *PdNode = PDT.getNode(const_cast<MachineBasicBlock *>(MachineLoop.getHeader()));
+ const auto *PdNode =
+ PDT.getNode(const_cast<MachineBasicBlock *>(MachineLoop.getHeader()));
const auto *IpdNode = PdNode->getIDom();
const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
while (PdBoundBlock && MachineLoop.contains(PdBoundBlock)) {
@@ -426,15 +432,17 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &Mach
// compute all join points
DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
auto JoinBlocks = Propagator.computeJoinPoints<const LoopExitVec &>(
- *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), PdBoundBlock);
+ *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(),
+ PdBoundBlock);
// AMDGPU change begin.
// Save divergent join pairs.
updateJoinMap(MachineLoop.getHeader(), DivergentJoinMap, Propagator.DefMap,
- *JoinBlocks.get());
+ *JoinBlocks.get());
// AMDGPU change end.
- auto ItInserted = CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks));
+ auto ItInserted =
+ CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks));
assert(ItInserted.second);
return *ItInserted.first->second;
}
@@ -452,18 +460,18 @@ SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) {
return *ItCached->second;
  // don't propagate beyond the immediate post dominator of the branch
- const auto *PdNode = PDT.getNode(const_cast<MachineBasicBlock *>(Term.getParent()));
+ const auto *PdNode =
+ PDT.getNode(const_cast<MachineBasicBlock *>(Term.getParent()));
const auto *IpdNode = PdNode->getIDom();
const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
-
// compute all join points
DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
const auto &TermBlock = *Term.getParent();
-
+
// AMDGPU CHANGE
// Make sure the post-dominator is outside the loop for the loop header.
- // Otherwise, we may not find all the join blocks in the loop
+ // Otherwise, we may not find all the join blocks in the loop
// because the search stops too early. Some join points can be reached
// after the post-dominator!
//
@@ -477,30 +485,30 @@ SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) {
//
// In this cfg, A is the loop header and P is A's post-dominator.
  // The algorithm to mark join points does a Reverse Post Order walk
- // from A and stops when it reaches the post dominator. It would not
+ // from A and stops when it reaches the post dominator. It would not
// mark the phi node in L as divergent even when A had a divergent branch.
// The fix we made was to make the join point search continue all the way
// to the loops post dominator (which is X in this example).
//
// NOTE: They already made this change for the loop case above, but for
- // a different bug apparently. See SyncDependenceAnalysis::join_blocks(MachineLoop&)
- //
+ // a different bug apparently. See
+ // SyncDependenceAnalysis::join_blocks(MachineLoop&)
+ //
const MachineLoop *MachineLoop = LI.getLoopFor(&TermBlock);
- if (MachineLoop && (MachineLoop->getHeader() == &TermBlock))
- {
- while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) {
- IpdNode = IpdNode->getIDom();
- PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
- }
+ if (MachineLoop && (MachineLoop->getHeader() == &TermBlock)) {
+ while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) {
+ IpdNode = IpdNode->getIDom();
+ PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+ }
}
-
+
auto JoinBlocks = Propagator.computeJoinPoints(
TermBlock, Term.getParent()->successors(), MachineLoop, PdBoundBlock);
// AMDGPU change begin.
// Save divergent join pairs.
updateJoinMap(&TermBlock, DivergentJoinMap, Propagator.DefMap,
- *JoinBlocks.get());
+ *JoinBlocks.get());
// AMDGPU change end.
auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
index a52bcc7bc9..321fcf5e6a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
@@ -1,4 +1,5 @@
-//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++ -*-===//
+//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++
+//-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -20,8 +21,8 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include <memory>
#include <map>
+#include <memory>
namespace llvm {
class MachineBasicBlock;
@@ -44,14 +45,16 @@ using ConstBlockSet = llvm::SmallPtrSet<const MachineBasicBlock *, 4>;
/// This analysis relates points of divergent control to points of converging
/// divergent control. The analysis requires all loops to be reducible.
class SyncDependenceAnalysis {
- void visitSuccessor(const MachineBasicBlock &succBlock, const MachineLoop *termLoop,
+ void visitSuccessor(const MachineBasicBlock &succBlock,
+ const MachineLoop *termLoop,
const MachineBasicBlock *defBlock);
public:
bool inRegion(const MachineBasicBlock &BB) const;
~SyncDependenceAnalysis();
- SyncDependenceAnalysis(const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+ SyncDependenceAnalysis(const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT,
const MachineLoopInfo &LI,
// AMDGPU change begin
DivergentJoinMapTy &JoinMap
@@ -88,11 +91,10 @@ private:
// AMDGPU change begin.
DivergentJoinMapTy &DivergentJoinMap;
// AMDGPU change end.
- std::map<const MachineLoop *, std::unique_ptr<ConstBlockSet>> CachedLoopExitJoins;
+ std::map<const MachineLoop *, std::unique_ptr<ConstBlockSet>>
+ CachedLoopExitJoins;
std::map<const MachineInstr *, std::unique_ptr<ConstBlockSet>>
CachedBranchJoins;
};
} // namespace llvm
-
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
index 648df7f724..49a8e4f076 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -1,4 +1,5 @@
-//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===//
+//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and
+//latency --===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,11 +13,11 @@
//
//===--------------------------------------------------------------------------------===//
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
#include "AMDGPUSubtarget.h"
#include "GCNSubtarget.h"
-#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
@@ -57,7 +58,7 @@ bool SchedScore::isBetter(const SchedScore &s) const {
bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const {
unsigned gain = latencyGain(TargetOccupancy, ExtraOcc);
// 10% is good enough.
- if ((10*gain) >= Alu)
+ if ((10 * gain) >= Alu)
return true;
else
return false;
@@ -65,7 +66,7 @@ bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const {
unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const {
unsigned latency = MemLatency;
- return (latency / (TgtOcc))- (latency / (TgtOcc + ExtraOcc));
+ return (latency / (TgtOcc)) - (latency / (TgtOcc + ExtraOcc));
}
// AMDGPULatencyTracker
@@ -73,7 +74,8 @@ AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST)
: SIII(ST.getInstrInfo()), ItinerayData(ST.getInstrItineraryData()) {}
void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
- if (MI.isDebugInstr()) return;
+ if (MI.isDebugInstr())
+ return;
int latency = SIII->getInstrLatency(ItinerayData, MI);
  // If we are inside a latency-hiding window.
if (!LatencyMIs.empty()) {
@@ -184,5 +186,3 @@ SchedScore CollectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
}
} // namespace llvm
-
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
index f108bab24b..7444f63845 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -1,4 +1,5 @@
-//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===//
+//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and
+//latency --===//
//
// The LLVM Compiler Infrastructure
//
@@ -30,7 +31,7 @@ struct SchedScore {
unsigned MemLatency = 0; // Only save mem latency.
  // We want memory latency to be small and the hidden latency to be big. Compare
  // memLatency - hide * Occ; smaller is better.
-  unsigned MixAlu = 0; // VAlu and SAlu can run in parallel if Occ > 1.
+  unsigned MixAlu = 0; // VAlu and SAlu can run in parallel if Occ > 1.
  unsigned Alu = 0; // avoid sequence of s_alu inst count less than occupancy.
unsigned Lds = 0; // Todo: count lds.
SchedScore() {}
@@ -39,9 +40,9 @@ struct SchedScore {
float computeScore() const;
float computeScore2() const;
- void sum(const SchedScore &s, unsigned loopDepth=0);
+ void sum(const SchedScore &s, unsigned loopDepth = 0);
bool isBetter(const SchedScore &s) const;
- bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc=1) const;
+ bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const;
  // More latency can be hidden with ExtraOcc.
unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;
};
@@ -71,4 +72,4 @@ struct AMDGPULatencyTracker {
SchedScore CollectLatency(llvm::MachineFunction &MF,
const llvm::GCNSubtarget &ST,
const llvm::MachineLoopInfo *MLI = nullptr);
-}
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
index a0f2a5d4dc..6f2200d8f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
@@ -1,9 +1,9 @@
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/SlotIndexes.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/SlotIndexes.h"
-//#include "dxc/DXIL/DxilMetadataHelper.h"
+// #include "dxc/DXIL/DxilMetadataHelper.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
@@ -14,9 +14,9 @@
#include "llvm/Support/Debug.h"
-#include "GCNRegPressure.h"
#include "AMDGPUMIRUtils.h"
#include "AMDGPUSubExpDag.h"
+#include "GCNRegPressure.h"
#include <unordered_set>
#define DEBUG_TYPE "xb-sub-exp-dag"
@@ -27,37 +27,35 @@ namespace llvm {
// Expression Dag.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void SubExp::dump(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) const {
- dbgs() << "\nSubExp:\n";
- dbgs() << "input regs:\n";
- for (auto &input : inputLive) {
- pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs());
- dbgs() << "\n";
- }
- dbgs() << "output regs:\n";
- for (auto &output : outputLive) {
- pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs());
- dbgs() << "\n";
- }
+void SubExp::dump(const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) const {
+ dbgs() << "\nSubExp:\n";
+ dbgs() << "input regs:\n";
+ for (auto &input : inputLive) {
+ pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs());
+ dbgs() << "\n";
+ }
+ dbgs() << "output regs:\n";
+ for (auto &output : outputLive) {
+ pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs());
+ dbgs() << "\n";
+ }
- for (MachineInstr *MI : SUnits) {
- MI->dump();
- }
- dbgs() << "End of SubExp\n";
+ for (MachineInstr *MI : SUnits) {
+ MI->dump();
+ }
+ dbgs() << "End of SubExp\n";
}
#endif
-bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo* SIRI) const
-{
- for (const MachineInstr *MI : SUnits)
- {
- if (MI->modifiesRegister(Reg, SIRI))
- {
- return true;
- }
+bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo *SIRI) const {
+ for (const MachineInstr *MI : SUnits) {
+ if (MI->modifiesRegister(Reg, SIRI)) {
+ return true;
}
+ }
- return false;
+ return false;
}
void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI,
@@ -95,7 +93,9 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI,
for (auto it = SUnits.rbegin(); it != SUnits.rend(); it++) {
MachineInstr *MI = *it;
- auto *ST = &MI->getMF()->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
+ auto *ST =
+ &MI->getMF()
+ ->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
for (MachineOperand &MO : MI->operands()) {
if (!MO.isReg())
continue;
@@ -149,8 +149,8 @@ bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const {
}
ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI,
- const llvm::SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII, const bool bJoinInput)
+ const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ const bool bJoinInput)
: MRI(MRI), SIRI(SIRI), SIII(SIII), bJoinInputToSubExp(bJoinInput) {}
template <typename T>
@@ -196,9 +196,9 @@ template void
ExpDag::build<DenseSet<MachineInstr *>>(const LiveSet &InputLiveReg,
const LiveSet &OutputLiveReg,
DenseSet<MachineInstr *> &instRange);
-template void ExpDag::build<std::vector<MachineInstr *>>(const LiveSet &InputLiveReg,
- const LiveSet &OutputLiveReg,
- std::vector<MachineInstr *> &instRange);
+template void ExpDag::build<std::vector<MachineInstr *>>(
+ const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg,
+ std::vector<MachineInstr *> &instRange);
void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg,
const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
@@ -311,7 +311,8 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg,
// UserMI should always be in same subExp.
unsigned UseSubIdx = SubtreeClasses[UseSU->NodeNum];
if (UseSubIdx != OriginSubIdx) {
-          // When a reg has multiple defs, it is possible for the user and def to be in different subExps.
+          // When a reg has multiple defs, it is possible for the user and def
+          // to be in different subExps.
if (MRI.getUniqueVRegDef(Reg))
llvm::report_fatal_error("user and def in different subExp");
break;
@@ -470,9 +471,8 @@ void BlockExpDag::buildWithPressure() {
buildPressure(StartLiveReg, EndLiveReg);
}
-void BlockExpDag::buildAvail(
- const LiveSet &passThruSet,
- DenseMap<SUnit *, LiveSet> &DagAvailRegMap) {
+void BlockExpDag::buildAvail(const LiveSet &passThruSet,
+ DenseMap<SUnit *, LiveSet> &DagAvailRegMap) {
DenseSet<SUnit *> Processed;
DenseSet<SUnit *> WorkList;
@@ -596,10 +596,10 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg,
// Using pass thru as base because output of current SU should not
// affect other output SUs.
GCNUpwardRPTracker RP(*LIS);
- RP.reset(BeginMI, &passThruSet, /*After*/true);
+ RP.reset(BeginMI, &passThruSet, /*After*/ true);
MachineInstr *MI = SU.getInstr();
if (MI) {
- RP.reset(*MI, &passThruSet, /*After*/true);
+ RP.reset(*MI, &passThruSet, /*After*/ true);
RP.recede(*MI);
}
DagPressureMap[&SU] = RP.getLiveRegs();
@@ -639,9 +639,9 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg,
GCNRPTracker::LiveRegSet SuccLive = DagPressureMap[SuccSU];
GCNUpwardRPTracker RP(*LIS);
- RP.reset(BeginMI, &SuccLive, /*After*/true);
+ RP.reset(BeginMI, &SuccLive, /*After*/ true);
if (MI) {
- RP.reset(*MI, &SuccLive, /*After*/true);
+ RP.reset(*MI, &SuccLive, /*After*/ true);
// Update SuccLive based on MI.
RP.recede(*MI);
}
@@ -684,9 +684,7 @@ std::string ExpDag::getGraphNodeLabel(const SUnit *SU) const {
}
/// Return the label.
-std::string ExpDag::getDAGName() const {
- return "dag.exp";
-}
+std::string ExpDag::getDAGName() const { return "dag.exp"; }
/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
/// rendered using 'dot'.
@@ -707,7 +705,7 @@ void ExpDag::dump() {
viewGraph(getDAGName(), "Exp Dag Graph for " + getDAGName());
}
-}
+} // namespace llvm
// Expression Dag dump.
namespace llvm {
@@ -757,7 +755,8 @@ struct DOTGraphTraits<llvm::ExpDag *> : public DefaultDOTGraphTraits {
SS << "SU:" << SU->NodeNum;
return SS.str();
}
- static std::string getNodeDescription(const SUnit *SU, const llvm::ExpDag *G) {
+ static std::string getNodeDescription(const SUnit *SU,
+ const llvm::ExpDag *G) {
return G->getGraphNodeLabel(SU);
}
static std::string getNodeAttributes(const SUnit *N,
@@ -804,7 +803,9 @@ void getRegBound(llvm::MachineBasicBlock *MBB,
const GCNRPTracker::LiveRegSet outputLive =
llvm::getLiveRegs(EndSlot, *LIS, MRI);
- auto* ST = &MBB->getParent()->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
+ auto *ST =
+ &MBB->getParent()
+ ->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
if (MBB->empty()) {
GCNRegPressure MaxPressure = getRegPressure(MRI, outputLive);
MaxSGPR = MaxPressure.getSGPRNum();
@@ -845,7 +846,7 @@ void getRegBound(llvm::MachineBasicBlock *MBB,
auto SchedResult = hrbSched(SUnits, BotRoots, MRI, SIRI);
GCNUpwardRPTracker RPTracker(*LIS);
- RPTracker.reset(MBB->front(), &outputLive, /*After*/true);
+ RPTracker.reset(MBB->front(), &outputLive, /*After*/ true);
for (auto it = SchedResult.rbegin(); it != SchedResult.rend(); it++) {
const SUnit *SU = *it;
if (!SU->isInstr())
@@ -1038,8 +1039,7 @@ void HRB::buildLinear(std::vector<llvm::SUnit> &SUnits) {
}
LLVM_DEBUG(
- dbgs() << "Chained Nodes:"; for (SUnit *SU
- : ChainedNodes) {
+ dbgs() << "Chained Nodes:"; for (SUnit *SU : ChainedNodes) {
dbgs() << " " << SU->NodeNum << "\n";
} for (int i = 0; i < Lineages.size(); i++) {
dbgs() << "Lineage" << i << ":";
@@ -1116,8 +1116,7 @@ SUnit *HRB::findHeir(SUnit *SU, std::vector<llvm::SUnit> &SUnits) {
return Heir;
}
-HRB::Lineage HRB::buildChain(SUnit *Node,
- std::vector<llvm::SUnit> &SUnits) {
+HRB::Lineage HRB::buildChain(SUnit *Node, std::vector<llvm::SUnit> &SUnits) {
HRB::Lineage chain;
chain.addNode(Node);
ChainedNodes.insert(Node);
@@ -1241,8 +1240,7 @@ void HRB::buildReachRelation(ArrayRef<SUnit *> BotRoots) {
}
ReachMap.erase(&FakeEntry);
- LLVM_DEBUG(for (Lineage &L
- : Lineages) {
+ LLVM_DEBUG(for (Lineage &L : Lineages) {
for (SUnit *SU : L.Nodes) {
DenseSet<SUnit *> &CurReach = ReachMap[SU];
dbgs() << SU->NodeNum << " reach: ";
@@ -1703,8 +1701,7 @@ std::vector<const SUnit *> hrbSched(std::vector<SUnit> &SUnits,
return confA > confB;
});
- LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU
- : ReadyList) {
+ LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU : ReadyList) {
dbgs() << " " << SU->NodeNum;
} dbgs() << "\n";);
SUnit *Candidate = nullptr;
@@ -1754,7 +1751,7 @@ std::vector<const SUnit *> hrbSched(std::vector<SUnit> &SUnits,
SUnit *SU = *it;
if (!Color.isHead(SU)) {
- continue;
+ continue;
}
Candidate = SU;
// Remove Candidate from ReadyList.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
index c234f32370..a7d29430b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
@@ -4,7 +4,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/MC/LaneBitmask.h"
-#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
namespace llvm {
class MachineFunction;
@@ -14,8 +14,7 @@ class SIRegisterInfo;
class SIInstrInfo;
class MachineInstr;
class MachineBasicBlock;
-template<typename GraphType>
-class GraphWriter;
+template <typename GraphType> class GraphWriter;
class SUnit;
class IntEqClasses;
class Twine;
@@ -55,13 +54,12 @@ struct SubExp {
const llvm::SIRegisterInfo *SIRI);
void dump(const llvm::MachineRegisterInfo &MRI,
const llvm::SIRegisterInfo *SIRI) const;
- bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo* SIRI) const;
+ bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo *SIRI) const;
};
struct ExpDag {
ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI,
- const llvm::SIInstrInfo *SIII,
- const bool bJoinInput);
+ const llvm::SIInstrInfo *SIII, const bool bJoinInput);
const llvm::MachineRegisterInfo &MRI;
const llvm::SIRegisterInfo *SIRI;
const llvm::SIInstrInfo *SIII;
@@ -83,13 +81,14 @@ struct ExpDag {
std::string getDAGName() const;
/// Adds custom features for a visualization of the ScheduleDAG.
void addCustomGraphFeatures(llvm::GraphWriter<ExpDag *> &) const {}
+
private:
- template<typename T>
- void initNodes(const LiveSet &InputLiveReg, T &insts);
+ template <typename T> void initNodes(const LiveSet &InputLiveReg, T &insts);
void addDataDep(const llvm::SIRegisterInfo *SIRI);
void addCtrlDep();
void buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg,
- const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII);
+ const llvm::SIRegisterInfo *SIRI,
+ const llvm::SIInstrInfo *SIII);
};
struct BlockExpDag : public ExpDag {
@@ -103,11 +102,11 @@ struct BlockExpDag : public ExpDag {
std::vector<SubExp> SubExps;
void build();
void buildWithPressure();
+
private:
void buildAvail(const LiveSet &passThruSet,
llvm::DenseMap<llvm::SUnit *, LiveSet> &DagAvailRegMap);
- void buildPressure(const LiveSet &StartLiveReg,
- const LiveSet &EndLiveReg);
+ void buildPressure(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg);
};
void getRegBound(llvm::MachineBasicBlock *MBB,
@@ -194,4 +193,4 @@ std::vector<const llvm::SUnit *> hrbSched(std::vector<llvm::SUnit> &SUnits,
const llvm::MachineRegisterInfo &MRI,
const llvm::SIRegisterInfo *SIRI);
-}
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
index c9172bae2c..09f1d8dfa4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
@@ -1,4 +1,5 @@
-//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG --------------===//
+//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG
+//--------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,9 +15,9 @@
//===----------------------------------------------------------------------===//
#pragma once
-#include <vector>
#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+#include <vector>
namespace llvm {
class MachineBasicBlock;
@@ -42,7 +43,6 @@ private:
void addCtrlDep();
};
-
// Collect height/depth for high-latency memory loads; height/depth only updates
// when crossing a high-latency memory load. Call the height/depth the VMem degree here.
// The rule is that a sample and its user should have different degrees.
@@ -60,15 +60,13 @@ private:
class VMemDegreeDAG {
public:
- VMemDegreeDAG(std::vector<llvm::SUnit> &Units,
- const llvm::SIInstrInfo *TII)
+ VMemDegreeDAG(std::vector<llvm::SUnit> &Units, const llvm::SIInstrInfo *TII)
: SUnits(Units), SIII(TII) {}
std::vector<llvm::SUnit> &SUnits;
// InstrInfo.
const llvm::SIInstrInfo *SIII;
void build();
-
bool isHighLatency(const llvm::SUnit *SU) const;
bool isHighLatency(const llvm::MachineInstr *MI) const;
// height/depth based on Long latency inst.
@@ -79,28 +77,24 @@ public:
std::vector<unsigned> VMemFullDepth;
llvm::SmallVector<llvm::SUnit *, 16> VMemSUs;
llvm::SmallVector<llvm::SmallVector<llvm::SUnit *, 8>, 16> GroupedVMemSUs;
- llvm::SmallVector<llvm::SmallVector<llvm::SUnit *, 8>, 16> GroupedVMemSUsByDepth;
-
+ llvm::SmallVector<llvm::SmallVector<llvm::SUnit *, 8>, 16>
+ GroupedVMemSUsByDepth;
void dump();
private:
static constexpr unsigned kNoReg = -1;
-
- std::pair<unsigned, unsigned> buildVMemDepthHeight(std::vector<unsigned> &VMemHeight,
- std::vector<unsigned> &VMemDepth, bool bDataOnly);
+ std::pair<unsigned, unsigned>
+ buildVMemDepthHeight(std::vector<unsigned> &VMemHeight,
+ std::vector<unsigned> &VMemDepth, bool bDataOnly);
// Compute vmem height/depth.
void buildVMemDepthHeight();
void buildVMemDataDepthHeight();
void groupVmemSUnits();
-
};
-
-
// Split block based on vmem depth.
void buildVMemDepth(llvm::MachineBasicBlock &MBB, llvm::VMemDegreeDAG &dag);
-}
-
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index cb10df2c34..8debda9032 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1313,7 +1313,7 @@ public:
bool isLowLatencyInstruction(const MachineInstr &MI) const;
bool isHighLatencyDef(int Opc) const override;
- bool isHighLatencyInstruction(const MachineInstr& MI) const {
+ bool isHighLatencyInstruction(const MachineInstr &MI) const {
return isHighLatencyDef(MI.getOpcode());
}
``````````
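
The comments in the sync-dependence hunks above describe the join-point search this way: every successor of a divergent branch propagates its own defining-block label through the CFG in reverse post order, a block that receives two different labels is a divergent join, and the walk is bounded by the branch block's immediate post-dominator. Below is a minimal standalone sketch of that labeling scheme, assuming a toy string-keyed CFG; the block names are made up, and the backedge re-propagation the real analysis performs is omitted.

``````````cpp
// Sketch of the join-point search: successors of a divergent branch each
// propagate their own label in reverse post order; a block that receives
// two different labels is a divergent join; the walk stops at the branch
// block's immediate post-dominator ("Bound").
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using CFG = std::map<std::string, std::vector<std::string>>;

std::set<std::string> computeJoins(const CFG &Succs,
                                   const std::vector<std::string> &RPO,
                                   const std::string &Branch,
                                   const std::string &Bound) {
  std::map<std::string, std::string> DefMap; // block -> reaching-def label
  for (const std::string &S : Succs.at(Branch))
    DefMap.emplace(S, S); // each successor starts as its own reaching def
  std::set<std::string> Joins;
  for (const std::string &B : RPO) {
    if (B == Bound || !DefMap.count(B))
      continue; // stop at the post-dominance bound; skip unreached blocks
    const std::string Label = DefMap.at(B);
    for (const std::string &S : Succs.at(B)) {
      auto It = DefMap.find(S);
      if (It == DefMap.end())
        DefMap.emplace(S, Label); // first reaching def wins
      else if (It->second != Label)
        Joins.insert(S); // two different reaching defs meet: divergent join
    }
  }
  return Joins;
}

int main() {
  // Toy CFG shaped like the one in the comments: A is a loop header with a
  // divergent branch to B and L; P is reached from the loop-exiting block L.
  CFG Succs = {{"A", {"B", "L"}}, {"B", {"L"}}, {"L", {"A", "P"}}, {"P", {}}};
  std::vector<std::string> RPO = {"A", "B", "L", "P"};
  for (const std::string &J : computeJoins(Succs, RPO, "A", "P"))
    std::cout << "join: " << J << "\n"; // prints: join: L
  return 0;
}
``````````

In the toy CFG, L is reported as a join because it is reachable from both successors of A. The change in `join_blocks(MachineInstr &)` above widens the post-dominance bound past the loop whenever the branch block is a loop header, so joins that sit beyond an in-loop post-dominator are not missed.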
</details>
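
On the occupancy side, the `SchedScore` hunks encode a simple memory-boundedness test: `latencyGain` estimates the cycles an extra wave would hide as `latency/TgtOcc - latency/(TgtOcc + ExtraOcc)`, and `isMemBound` reports memory bound when ten times that gain reaches the ALU cycle count (the "10% is good enough" check). Here is a self-contained sketch with made-up numbers; the 4000-cycle latency, 900 ALU cycles, and occupancy of 4 are hypothetical, not from the patch.

``````````cpp
#include <cstdio>

// Mirrors the formula in SchedScore::latencyGain from the diff: cycles of
// memory latency hidden by raising occupancy from TgtOcc to TgtOcc + ExtraOcc,
// assuming the latency is split across the resident waves.
unsigned latencyGain(unsigned MemLatency, unsigned TgtOcc, unsigned ExtraOcc) {
  return (MemLatency / TgtOcc) - (MemLatency / (TgtOcc + ExtraOcc));
}

// Mirrors SchedScore::isMemBound: memory bound when the gain from extra
// occupancy is at least 10% of the ALU cycle count (10 * gain >= Alu).
bool isMemBound(unsigned MemLatency, unsigned Alu, unsigned TgtOcc,
                unsigned ExtraOcc = 1) {
  return 10 * latencyGain(MemLatency, TgtOcc, ExtraOcc) >= Alu;
}

int main() {
  // Hypothetical block: 4000 cycles of memory latency, 900 ALU cycles,
  // current target occupancy of 4 waves.
  unsigned Gain = latencyGain(4000, 4, 1); // 4000/4 - 4000/5 = 200 cycles
  std::printf("gain=%u memBound=%d\n", Gain, isMemBound(4000, 900, 4));
  return 0;
}
``````````

With those numbers the extra wave hides 200 cycles, and 10 * 200 >= 900, so the block would count as memory bound under this heuristic.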
https://github.com/llvm/llvm-project/pull/126331