[llvm] [AMDGPU] Added hot-block-rematerialize pass (PR #136631)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu May 1 05:10:39 PDT 2025
================
@@ -0,0 +1,657 @@
+//===------- AMDGPUMIRUtils.cpp - Helpers for MIR passes ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Helper functions for MIR passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMIRUtils.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define DEBUG_TYPE "amdgpu-mir-utils"
+using namespace llvm;
+
+namespace llvm {
+bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd,
+ MachineBasicBlock &MBB) {
+ // Skip debug instructions so BBEnd ends up pointing at the last non-debug
+ // instruction of the block (if any).
+ while (BBEnd != MBB.rend() && BBEnd->isDebugInstr())
+ ++BBEnd;
+ return BBEnd != MBB.rend();
+}
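+
+// A minimal usage sketch (hypothetical caller, for illustration only):
+//
+//   MachineBasicBlock::reverse_iterator BBEnd = MBB.rbegin();
+//   if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) {
+//     // BBEnd now points at the last non-debug instruction of MBB.
+//     MachineInstr &LastMI = *BBEnd;
+//   }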
+} // namespace llvm
+
+namespace {
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes,
+ SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+ MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
+ MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
+ // Treat segments not bounded by instructions as non-local.
+ if (!StartMI || !EndMI)
+ return false;
+ // The segment is local when both ends are in the same MBB.
+ bool IsSameMBB = StartMI->getParent() == EndMI->getParent();
+ if (!IsSameMBB)
+ return false;
+ // Collect touched MBB.
+ MachineBasicBlock *MBB = StartMI->getParent();
+ TouchedMBBSet.insert(MBB);
+ return true;
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes,
+ SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+ for (const LiveRange::Segment &Seg : Range->segments) {
+ if (!isLocalSegment(&Seg, Indexes, TouchedMBBSet))
+ return false;
+ }
+ return true;
+}
+
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) {
+ MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
+ MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
+ // Treat segments not bounded by instructions as non-local.
+ if (!StartMI || !EndMI)
+ return false;
+ // The segment is local when both ends are in the same MBB.
+ return StartMI->getParent() == EndMI->getParent();
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) {
+ for (const LiveRange::Segment &Seg : Range->segments) {
+ if (!isLocalSegment(&Seg, Indexes))
+ return false;
+ }
+ return true;
+}
+
+// LoopInfo contains a mapping from basic block to the innermost loop. Find
+// the outermost loop in the loop nest that contains BB.
+const MachineLoop *getOutermostLoop(const MachineLoopInfo *LI,
+ const MachineBasicBlock *BB) {
+ const MachineLoop *L = LI->getLoopFor(BB);
+ if (L) {
+ while (const MachineLoop *Parent = L->getParentLoop())
+ L = Parent;
+ }
+ return L;
+}
+
+bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1,
+ const MachineBasicBlock *BB2) {
+ const MachineLoop *L1 = getOutermostLoop(LI, BB1);
+ const MachineLoop *L2 = getOutermostLoop(LI, BB2);
+ return L1 != nullptr && L1 == L2;
+}
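+
+// For illustration (hypothetical loop nest): with L_outer { L_inner { BB1 } BB2 },
+// both BB1 and BB2 map to L_outer, so loopContainsBoth() returns true; a block
+// that is not inside any loop has no outermost loop and never matches.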
+
+} // namespace
+
+namespace llvm {
+
+bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
+ llvm::MachineBasicBlock::iterator MI) {
+ const TargetRegisterInfo *TRI =
+ MBB->getParent()->getRegInfo().getTargetRegisterInfo();
+ for (auto It = MI; It != MBB->end(); ++It) {
+ const MachineInstr &CurMI = *It;
+ // Found a use of SCC before any def: SCC is live here.
+ if (CurMI.readsRegister(AMDGPU::SCC, TRI))
+ return true;
+ // Found a def of SCC before any use: SCC is dead here.
+ if (CurMI.definesRegister(AMDGPU::SCC, TRI))
+ return false;
+ }
+ // Reach the end of MBB, check live-ins of MBB successors.
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ->isLiveIn(AMDGPU::SCC))
+ return true;
+ }
+ return false;
+}
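+
+// A minimal usage sketch (hypothetical, for illustration only): before emitting
+// an instruction that defines SCC at InsertPt, a caller can check whether an
+// existing SCC value would be clobbered:
+//
+//   if (!llvm::isSccLiveAt(&MBB, InsertPt)) {
+//     // SCC is dead here; safe to emit an SCC-defining instruction.
+//   }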
+
+//
+// This function is useful when we need to insert a new instruction that
+// defines SCC in a block and we need to find a location that will not
+// clobber the existing value.
+//
+// Starting at `MI` it looks backwards for a place in the block where SCC is
+// dead, so the new def can be inserted there. If no such location can be
+// found, it will save and restore SCC around MI. This way MI can safely be
+// used as the new insert location.
+//
+MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
+ MachineBasicBlock *MBB, MachineBasicBlock::iterator MI,
+ const TargetRegisterInfo *TRI, const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) {
+ // If SCC is dead at MI then we can use MI as the insert point.
+ if (!llvm::isSccLiveAt(MBB, MI))
+ return MI;
+
+ const bool CheckForExecWrite =
+ Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
+
+ // Get the starting reverse iterator taking care to handle the MBB->end()
+ // case.
+ MachineBasicBlock::reverse_iterator Start;
+ if (MI == MBB->end())
+ Start = MBB->rbegin();
+ else
+ Start = MI.getReverse();
+
+ // Otherwise, walk backwards through the block looking for a location where
+ // SCC is dead.
+ for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend();
+ It != End; ++It) {
+ // If the instruction modifies exec then we cannot use it as
+ // an insertion point (if that is a constraint from the caller).
+ // The check for EXEC works for both wave64 and wave32 because
+ // it will also catch writes to the subregisters (e.g. exec_lo).
+ if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI))
+ break;
+
+ if (It->modifiesRegister(AMDGPU::SCC, TRI) &&
+ !It->readsRegister(AMDGPU::SCC, TRI))
+ return It->getIterator();
+ }
+
+ // If no safe location can be found in the block we can save and restore
+ // SCC around MI. There is no way to directly read or write SCC, so we use
+ // S_CSELECT to read the current value of SCC and S_CMP to write the saved
+ // value back to SCC.
+ //
+ // The generated code will look like this:
+ //
+ // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC
+ // <----- Newly created safe insert point.
+ // MI
+ // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC
+ //
+ Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(*MBB, std::next(MI->getIterator()), DL,
+ TII->get(AMDGPU::S_CMP_LG_U32))
+ .addReg(TmpScc, RegState::Kill)
+ .addImm(0);
+
+ return MI;
+}
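+
+// A minimal usage sketch (hypothetical names, for illustration only): emit an
+// SCC-defining compare before MI without clobbering a live SCC value and
+// without crossing an exec write:
+//
+//   MachineBasicBlock::iterator InsertPt =
+//       llvm::findOrCreateInsertionPointForSccDef(
+//           &MBB, MI, TRI, TII, MRI,
+//           SccDefInsertPointConstraintFlags::NoExecWrite);
+//   BuildMI(MBB, InsertPt, MI->getDebugLoc(), TII->get(AMDGPU::S_CMP_LG_U32))
+//       .addReg(SrcReg)
+//       .addImm(0);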
+
+// In a case like float4 v, where v.x is used and defined in one block and v.y
+// is used and defined in another block, one live interval can touch more than
+// one MBB. TouchedMBBSet is used for scheduling, where a local live interval
+// can cross multiple regions and the live registers must be recalculated for
+// each region inside the touched MBBs.
+bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes,
+ SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges())
+ if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
+ return false;
+ }
+ return isLocalLiveRange(&LI, Indexes, TouchedMBBSet);
+}
+
+bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges())
+ if (!isLocalLiveRange(&S, Indexes))
+ return false;
+ }
+ return isLocalLiveRange(&LI, Indexes);
+}
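+
+// A minimal usage sketch (hypothetical, for illustration only), assuming a
+// LiveIntervals analysis LIS is available for the current function:
+//
+//   const LiveInterval &LI = LIS->getInterval(Reg);
+//   if (llvm::isLocalLiveInterval(LI, LIS->getSlotIndexes())) {
+//     // Reg is live only within a single basic block.
+//   }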
+
+void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
+ dbgs() << "\n live set: \n";
+ for (auto It : LiveSet) {
+ Register Reg = It.first;
+ dbgs() << printReg(Reg, SIRI);
+ if (It.second.any())
+ dbgs() << " mask:" << It.second.getAsInteger();
+ dbgs() << "\n";
+ }
+}
+
+LaneBitmask getRegMask(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) {
+ // We don't rely on the read-undef flag because, in the case of tentative
+ // schedule tracking, it isn't set correctly yet. This still works correctly,
+ // however, since the use mask has already been tracked with LIS.
+ return MO.getSubReg() == 0
+ ? MRI.getMaxLaneMaskForVReg(MO.getReg())
+ : MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(
+ MO.getSubReg());
+}
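+
+// A minimal usage sketch (hypothetical, for illustration only): accumulate the
+// lanes of Reg read by an instruction MI:
+//
+//   LaneBitmask UseMask = LaneBitmask::getNone();
+//   for (const MachineOperand &MO : MI.operands())
+//     if (MO.isReg() && MO.isUse() && MO.getReg() == Reg)
+//       UseMask |= llvm::getRegMask(MO, MRI);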
+
+struct Piece {
+ unsigned Reg;
+ unsigned Offset;
+ unsigned Size;
+ static SmallVector<Piece, 8> split(std::bitset<32> Mask) {
+ SmallVector<Piece, 8> Pieces;
+ Piece Piece = {0, 0, 0};
+ for (unsigned i = 0; i < 32; i++) {
+ if (Mask.test(i)) {
+ if (Piece.Size == 0)
+ Piece.Offset = i;
+
+ Piece.Size++;
+ // Make sure no piece is bigger than 8 lanes.
+ if (Piece.Size == 8) {
+ Pieces.emplace_back(Piece);
+ Piece.Size = 0;
+ }
+ } else {
+ if (Piece.Size == 0) {
+ continue;
+ }
+ Pieces.emplace_back(Piece);
+ Piece.Size = 0;
+ }
+ }
+ return Pieces;
+ }
+};
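+
+// Worked example (illustrative only): for a mask with bits 0-2 and 4-13 set,
+// split() yields three pieces, since a piece never spans more than 8 lanes:
+//
+//   Piece::split(std::bitset<32>(0b11111111110111))
+//     -> {Reg=0, Offset=0, Size=3}, {Reg=0, Offset=4, Size=8},
+//        {Reg=0, Offset=12, Size=2}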
+
+static unsigned getNumLanesIn32BitReg(Register Reg, const SIRegisterInfo *SIRI,
+ const MachineRegisterInfo &MRI) {
+ const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+ const TargetRegisterClass *SubregRC =
+ SIRI->getSubRegisterClass(RC, AMDGPU::sub0);
+ return SubregRC->LaneMask.getNumLanes();
+}
+
+static std::vector<unsigned>
+getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterInfo *TRI,
----------------
arsenm wrote:
This is reinventing getCoveringSubRegIndexes?
https://github.com/llvm/llvm-project/pull/136631
More information about the llvm-commits
mailing list