[llvm] [AMDGPU] Added hot-block-rematerialize pass (PR #136631)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu May 1 05:10:40 PDT 2025
================
@@ -0,0 +1,1511 @@
+//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot BlockRematerialize ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// AMDGPU hot block rematerialize: rematerialize values to reduce register
+/// pressure in hot basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMIRUtils.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "GCNRegPressure.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+#define DEBUG_TYPE "amdgpu-hot-block-remat"
+
+using namespace llvm;
+
+// Command-line knobs for the pass. NOTE(review): neither option declares a
+// cl::desc(), so they appear undocumented in -help output — consider adding
+// descriptions.
+static cl::opt<bool>
+ EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
+static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
+
+namespace {
+
+// File-local shorthands. LLVM style prefers 'using' aliases over typedef;
+// BlockMap below already required the alias form for its template parameter,
+// so make all three consistent.
+using InstSet = DenseSet<MachineInstr *>;
+using BlockSet = DenseSet<MachineBasicBlock *>;
+template <typename T> using BlockMap = MapVector<MachineBasicBlock *, T>;
+
+// Bookkeeping for one rematerialization candidate: the register, its defining
+// instruction, and (once a strategy is chosen) where to re-emit the def.
+// In-class member initializers replace the two hand-written constructor
+// initializer lists, which duplicated every default value.
+struct RematNode {
+  enum class RematKind {
+    Candidate, // Not ready yet.
+    OneDefOneUse,
+    Clone,
+  };
+  RematNode() = default;
+  RematNode(unsigned R, MachineInstr *MI, unsigned S)
+      : Reg(R), DefMI(MI), Size(S) {}
+  unsigned Reg = 0;
+  MachineInstr *DefMI = nullptr;
+  // Block chosen to hold the re-emitted def; null until a decision is made.
+  MachineBasicBlock *InsertBlock = nullptr;
+  // InsertPointMI and UserCount share storage; presumably the count is only
+  // used before a concrete insertion point exists -- TODO confirm against the
+  // users of this union.
+  union {
+    MachineInstr *InsertPointMI = nullptr;
+    unsigned UserCount;
+  };
+  RematKind Kind = RematKind::Candidate;
+  unsigned Size = 0;
+};
+
+// Liveness/pressure summary for one hot block.
+struct BlockLiveInfo {
+ MachineBasicBlock *BB;
+ // Max scalar / vector register pressure in BB (per the field names -- TODO
+ // confirm against the producer of this struct).
+ unsigned MaxSReg;
+ unsigned MaxVReg;
+ // Input live is the live reg set which crosses into this block.
+ const GCNRPTracker::LiveRegSet InputLive;
+};
+
+// Function-wide register-pressure snapshot that drives the remat decisions.
+struct RematStatus {
+ unsigned TargetOcc;
+ // Vector / scalar pressure limits for the target occupancy (per the field
+ // names -- TODO confirm).
+ unsigned TargetVLimit;
+ unsigned TargetSLimit;
+ // Peak vector / scalar pressure observed.
+ unsigned MaxVPressure;
+ unsigned MaxSPressure;
+ // Pressure contributed by physical input registers.
+ unsigned InputPhysicalVPressure;
+ unsigned InputPhysicalSPressure;
+ // More occupancy can help more than the latency cost needed to reach it.
+ bool MemBound;
+ // abs(VTargetOcc-STargetOcc) > 1.
+ bool NotBalance;
+ DenseMap<MachineBasicBlock *, GCNRegPressure> MBBPressureMap;
+ // Live register sets entering / leaving each MBB.
+ DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBInputLiveMap;
+ DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBOutputLiveMap;
+ // Collect MBBs which have memory writes. When moving instructions across
+ // MBBs, skip mem insts if the MBB has a memory write. To make things fast,
+ // just check mayStore and isBarrier.
+ DenseSet<MachineBasicBlock *> MemWriteMBBSet;
+};
+
+// Machine-function pass that rematerializes values to reduce register
+// pressure in hot blocks (see hotBlockRemat, the main driver).
+class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
+
+public:
+ static char ID;
+
+ // Instruction sets tracked across the pass; removeInst keeps all three in
+ // sync when an instruction is deleted.
+ DenseSet<const MachineInstr *> TotalUniformInsts;
+ DenseSet<const MachineInstr *> SafeToRemoveInsts;
+ DenseSet<const MachineInstr *> DivergentInsts;
+ void removeInst(const MachineInstr *MI) {
+ TotalUniformInsts.erase(MI);
+ SafeToRemoveInsts.erase(MI);
+ DivergentInsts.erase(MI);
+ }
+
+ AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ // Apply a Clone-kind remat for Node (definition later in this file).
+ void applyCloneRemat(RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+ SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF);
+ // Apply every decided remat recorded in RematMap.
+ void applyRemat(MapVector<Register, RematNode> &RematMap,
+ std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF);
+ // Main driver; IsNearTarget presumably reports "close to but not at the
+ // pressure target" -- TODO confirm at the definition.
+ bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
+ LiveIntervals *LIS, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, bool &IsNearTarget);
+
+ StringRef getPassName() const override { return "AMDGPU rematerialize"; }
+
+ // CFG is preserved; the pass only moves/clones defs within existing blocks.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
+ AU.addRequired<SlotIndexesWrapperPass>();
+ AU.addRequired<LiveIntervalsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+// If inserting InstructionToMove at CurrentInsertPoint would clobber a live
+// SCC, return a (possibly newly created) insertion point where the SCC def is
+// safe; otherwise return CurrentInsertPoint unchanged.
+MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash(
+    MachineInstr *InstructionToMove, MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  if (!InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI))
+    return CurrentInsertPoint;
+  return llvm::findOrCreateInsertionPointForSccDef(MBB, CurrentInsertPoint,
+                                                   SIRI, SIII, &MRI);
+}
+
+// Decide where clones of Reg's def are actually needed. Prunes UserBlocks and
+// UserMBBSet in place:
+//  - user blocks that dominate every hot block need no clone at all;
+//  - user blocks dominated by every hot block are grouped under a dominating
+//    block so one clone can serve the whole group.
+// Returns a map from each surviving dominating block to the set of blocks it
+// dominates (which share its clone).
+DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
+ unsigned Reg, BlockMap<SmallVector<MachineInstr *, 2>> &UserBlocks,
+ DenseSet<MachineBasicBlock *> &UserMBBSet,
+ std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT) {
+ // Collect hot blocks which Reg is live into.
+ DenseSet<MachineBasicBlock *> HotBlockSet;
+ for (BlockLiveInfo &HotBlock : HotBlocks) {
+ if (HotBlock.InputLive.count(Reg)) {
+ HotBlockSet.insert(HotBlock.BB);
+ }
+ }
+
+ // For userBlocks which dominate all hotBlocks, no clone is needed because
+ // the value does not cross the hotBlocks once the later blocks are cloned.
+ // For userBlocks which are dominated by all hotBlocks, clones can be shared
+ // because once past the hot blocks, the pressure is OK.
+ DenseSet<MachineBasicBlock *> AfterHotRangeMBBs;
+ for (MachineBasicBlock *MBB : UserMBBSet) {
+ // Always clone in hot block.
+ if (HotBlockSet.count(MBB))
+ continue;
+
+ bool IsDomAllHotBlocks = true;
+ bool IsDomedByAllHotBlocks = true;
+ for (MachineBasicBlock *HotMBB : HotBlockSet) {
+ if (!DT->dominates(MBB, HotMBB))
+ IsDomAllHotBlocks = false;
+ if (!DT->dominates(HotMBB, MBB))
+ IsDomedByAllHotBlocks = false;
+ // Both answers known; stop scanning hot blocks early.
+ if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks)
+ break;
+ }
+ if (IsDomAllHotBlocks)
+ UserBlocks.erase(MBB);
+ else if (IsDomedByAllHotBlocks)
+ AfterHotRangeMBBs.insert(MBB);
+ }
+
+ // Split the after-hot-range block set by domtree: a block that dominates
+ // another absorbs the dominated block's set so they share one clone.
+ DenseMap<MachineBasicBlock *, BlockSet> DomMap;
+ if (!AfterHotRangeMBBs.empty()) {
+ for (MachineBasicBlock *MBB : AfterHotRangeMBBs) {
+ for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) {
+ if (MBB == MBB2)
+ continue;
+ if (DT->dominates(MBB, MBB2)) {
+ auto &Dom = DomMap[MBB];
+ Dom.insert(MBB2);
+ auto &Dom2 = DomMap[MBB2];
+ Dom.insert(Dom2.begin(), Dom2.end());
+ }
+ }
+ }
+ for (MachineBasicBlock *MBB : AfterHotRangeMBBs) {
+ auto &Dom = DomMap[MBB];
+ for (MachineBasicBlock *DomedMBB : Dom) {
+ // Remove dominated MBBs: they will use their dominator's clone.
+ DomMap.erase(DomedMBB);
+ UserMBBSet.erase(DomedMBB);
+ }
+ }
+ }
+
+ return DomMap;
+}
+
+// Rewrite every operand of the given user instructions that references Reg so
+// it references NewReg instead. When the original def wrote a subregister
+// (IsSubRegDef), the users' subreg indices are cleared, since the replacement
+// register was created with the subregister's class.
+void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef,
+                 SmallVector<MachineInstr *, 2> &UserMIs) {
+  for (MachineInstr *UserMI : UserMIs) {
+    for (MachineOperand &Op : UserMI->operands()) {
+      if (!Op.isReg() || Op.getReg() != Reg)
+        continue;
+      Op.setReg(NewReg);
+      if (IsSubRegDef)
+        Op.setSubReg(0);
+    }
+  }
+}
+
+void AMDGPUHotBlockRematerialize::applyCloneRemat(
+ RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+ SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF) {
+ unsigned Reg = Node.Reg;
+
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+ auto DefOp = DefMI->getOperand(0);
+ const MCInstrDesc &Desc = DefMI->getDesc();
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ // When the unique def has subReg, just create newReg for the subReg part.
+ bool IsSubRegDef = false;
+ if (DefOp.getSubReg() != 0) {
+ RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg());
----------------
arsenm wrote:
Recreating getOpRegClass?
https://github.com/llvm/llvm-project/pull/136631
More information about the llvm-commits
mailing list