[llvm] [AMDGPU] Added hot-block-rematerialize pass (PR #136631)
Adam Yang via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 22 11:54:44 PDT 2025
https://github.com/adam-yang updated https://github.com/llvm/llvm-project/pull/136631
>From a9464fadec85393f0344cba9c9e94b125f170445 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Fri, 18 Apr 2025 11:14:14 -0700
Subject: [PATCH 1/6] Adding remat piece by piece
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 4 +
.../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 1303 +++++++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 217 +++
llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 62 +
.../AMDGPUOccupancyAndLatencyHelper.cpp | 18 +
.../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 53 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 11 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 3 +
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 4 +
9 files changed, 1675 insertions(+)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4ff761ec19b3c..1ba8e3e2a54d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -530,6 +530,10 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;
+void initializeAMDGPUHotBlockRematerializePass(llvm::PassRegistry &);
+FunctionPass *createAMDGPUHotBlockRematerializePass();
+extern char &AMDGPUHotBlockRematerializeID;
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
new file mode 100644
index 0000000000000..70b25beeb22b9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -0,0 +1,1303 @@
+//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU hot-block rematerialization pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMIRUtils.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "AMDGPU.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "GCNRegPressure.h"
+
+#define DEBUG_TYPE "amdgpu-hot-block-remat"
+
+using namespace llvm;
+
+static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
+
+namespace {
+
+typedef DenseSet<MachineInstr *> InstSet;
+typedef DenseSet<MachineBasicBlock *> BlockSet;
+
+struct RematNode {
+ enum class RematKind {
+ Candidate, // Not ready yet.
+ OneDefOneUse,
+ Clone,
+ };
+ RematNode()
+ : Reg(0), DefMI(nullptr), InsertBlock(nullptr), InsertPointMI(nullptr),
+ Kind(RematKind::Candidate), Size(0) {}
+ RematNode(unsigned R, MachineInstr *MI, unsigned S)
+ : Reg(R), DefMI(MI), InsertBlock(nullptr), InsertPointMI(nullptr),
+ Kind(RematKind::Candidate), Size(S) {}
+ unsigned Reg;
+ MachineInstr *DefMI;
+ MachineBasicBlock *InsertBlock;
+ union {
+ MachineInstr *InsertPointMI;
+ unsigned UserCount;
+ };
+ RematKind Kind;
+ unsigned Size;
+};
+
+struct BlockLiveInfo {
+ MachineBasicBlock *BB;
+ unsigned MaxSReg;
+ unsigned MaxVReg;
+ // Input live is the set of registers that are live across blocks.
+ const GCNRPTracker::LiveRegSet InputLive;
+};
+
+struct RematStatus {
+ unsigned TargetOcc;
+ unsigned TargetVLimit;
+ unsigned TargetSLimit;
+ unsigned MaxVPressure;
+ unsigned MaxSPressure;
+ unsigned InputPhysicalVPressure;
+ unsigned InputPhysicalSPressure;
+ // Higher occupancy helps more than the latency cost of reaching it.
+ bool MemBound;
+ // abs(VTargetOcc-STargetOcc) > 1.
+ bool NotBalance;
+ DenseMap<MachineBasicBlock *, GCNRegPressure> MBBPressureMap;
+ DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBInputLiveMap;
+ DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBOutputLiveMap;
+ // Collect MBBs which have memory writes. When moving instructions across
+ // MBBs, skip memory instructions if the MBB has a memory write. To keep
+ // things fast, just check mayStore and isBarrier.
+ DenseSet<MachineBasicBlock *> MemWriteMBBSet;
+};
+
+class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
+
+public:
+ static char ID;
+
+ DenseSet<const MachineInstr *> TotalUniformInsts;
+ DenseSet<const MachineInstr *> SafeToRemoveInsts;
+ DenseSet<const MachineInstr *> DivergentInsts;
+ void removeInst(const MachineInstr *MI) {
+ TotalUniformInsts.erase(MI);
+ SafeToRemoveInsts.erase(MI);
+ DivergentInsts.erase(MI);
+ }
+
+ AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void applyRemat(MapVector<Register, RematNode> &RematMap,
+ std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
+ llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ MachineFunction &MF);
+ void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
+ llvm::SlotIndexes *SlotIndexes,
+ const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII);
+ void applyCloneRemat(RematNode &Node,
+ std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+ llvm::SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF);
+ bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
+ LiveIntervals *LIS, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, bool &IsNearTarget);
+
+ StringRef getPassName() const override { return "AMDGPU rematerialize"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
+ AU.addRequired<SlotIndexesWrapperPass>();
+ AU.addRequired<LiveIntervalsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+void AMDGPUHotBlockRematerialize::applyRemat(MapVector<Register, RematNode> &RematMap,
+ std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
+ llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ MachineFunction &MF) {
+ std::vector<RematNode> UpdateList;
+ for (auto &It : RematMap) {
+ UpdateList.emplace_back(It.second);
+ }
+ // Sort the update list by slot index to make sure defs are moved before uses.
+ // If a use were moved before its def, it might no longer be the first use.
+ std::sort(UpdateList.begin(), UpdateList.end(),
+ [&SlotIndexes](RematNode &I, RematNode &J) {
+ SlotIndex A = SlotIndexes->getInstructionIndex(*I.DefMI);
+ SlotIndex B = SlotIndexes->getInstructionIndex(*J.DefMI);
+ return A < B;
+ });
+
+ for (RematNode &Node : UpdateList) {
+ if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+ applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII);
+ } else if (Node.Kind == RematNode::RematKind::Clone) {
+ applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII,
+ MF);
+ }
+ }
+}
+
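+// Compute the maximum SGPR/VGPR pressure of MBB with an upward RP tracker
+// seeded from the block's output live set, remember blocks that contain
+// memory writes, and return the block's occupancy.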
+unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS,
+ const GCNSubtarget *ST, unsigned &MaxVPressure,
+ unsigned &MaxSPressure, RematStatus &Status) {
+ // Skip processing the current block if it has only debug instructions.
+ if (MBB.getFirstNonDebugInstr() == MBB.end())
+ return ST->getOccupancyWithNumVGPRs(0);
+ auto BBEnd = MBB.rbegin();
+ GCNUpwardRPTracker RPTracker(*LIS);
+ // BBEnd doesn't point to the boundary instruction.
+ // Skip debug instructions.
+ if (!llvm::getNonDebugMBBEnd(BBEnd, MBB))
+ return ST->getOccupancyWithNumVGPRs(0);
+
+ GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB];
+ RPTracker.reset(*BBEnd, &OutputLive, true);
+
+ for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) {
+ MachineInstr &MI = (*I++);
+ RPTracker.recede(MI);
+ if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH))
+ Status.MemWriteMBBSet.insert(&MBB);
+ }
+
+ GCNRegPressure RP = RPTracker.getMaxPressureAndReset();
+ unsigned SPressure = RP.getMaxSGPR();
+ if (SPressure > MaxSPressure) {
+ MaxSPressure = SPressure;
+ }
+ if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) {
+ MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
+ }
+ Status.MBBPressureMap[&MBB] = RP;
+ return RP.getOccupancy(*ST);
+}
+
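+// Build the per-block input/output live register sets from the live intervals
+// (skipping block-local intervals), then accumulate per-block pressure and
+// return the smallest occupancy seen across the function.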
+unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS,
+ const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST, unsigned &MaxVPressure,
+ unsigned &MaxSPressure, RematStatus &Status) {
+ unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second;
+ // If there is only one block, the input/output virtual live sets are empty.
+ if (MF.size() > 1) {
+ // Build the input/output live register sets first.
+ auto *SlotIndexes = LIS->getSlotIndexes();
+ DenseMap<MachineBasicBlock *, SlotIndex> MBBInputSlotMap;
+ DenseMap<MachineBasicBlock *, SlotIndex> MBBOutputSlotMap;
+ for (MachineBasicBlock &MBB : MF) {
+ auto BBBegin = MBB.getFirstNonDebugInstr();
+ if (BBBegin != MBB.end()) {
+ auto SI = SlotIndexes->getInstructionIndex(*BBBegin);
+ MBBInputSlotMap[&MBB] = SI;
+ }
+
+ auto BBEnd = MBB.rbegin();
+
+ // BBEnd doesn't point to the boundary instruction.
+ // Skip debug instructions.
+ if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) {
+ auto SI = SlotIndexes->getInstructionIndex(*BBEnd);
+ MBBOutputSlotMap[&MBB] = SI;
+ }
+ }
+
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ auto Reg = Register::index2VirtReg(I);
+ if (!LIS->hasInterval(Reg))
+ continue;
+
+ const auto &LI = LIS->getInterval(Reg);
+
+ // Skip local live intervals to make building the input/output sets faster.
+ if (llvm::isLocalLiveInterval(LI, SlotIndexes))
+ continue;
+
+ for (auto InputIt : MBBInputSlotMap) {
+ MachineBasicBlock *MBB = InputIt.first;
+ auto SI = InputIt.second;
+
+ auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+ if (LiveMask.any())
+ Status.MBBInputLiveMap[MBB][Reg] |= LiveMask;
+ }
+
+ for (auto OutputIt : MBBOutputSlotMap) {
+ MachineBasicBlock *MBB = OutputIt.first;
+ auto SI = OutputIt.second;
+
+ auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+ if (LiveMask.any())
+ Status.MBBOutputLiveMap[MBB][Reg] |= LiveMask;
+ }
+ }
+ }
+
+ LLVM_DEBUG(
+ const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+ dbgs() << "output live"; for (auto &It : Status.MBBOutputLiveMap) {
+ unsigned Idx = It.first->getNumber();
+ auto LiveReg = It.second;
+ dbgs() << "MBB" << Idx << ":";
+ llvm::dumpLiveSet(LiveReg, SIRI);
+ } dbgs() << "input live";
+ for (auto &It : Status.MBBInputLiveMap) {
+ unsigned Idx = It.first->getNumber();
+ auto LiveReg = It.second;
+ dbgs() << "MBB" << Idx << ":";
+ llvm::dumpLiveSet(LiveReg, SIRI);
+ });
+
+ for (auto It = MF.begin(); It != MF.end(); ++It) {
+ MachineBasicBlock &MBB = *It;
+ unsigned Occ =
+ collectMBBPressure(MBB, LIS, ST, MaxVPressure, MaxSPressure, Status);
+ if (TgtOcc > Occ)
+ TgtOcc = Occ;
+ }
+ return TgtOcc;
+}
+
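+// Summarize the function: per-block pressure and live sets, whether it is
+// memory bound, the physical live-in pressure, and the SGPR/VGPR limits for
+// the chosen target occupancy.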
+RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI,
+ LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST) {
+ unsigned MaxSPressure = 0;
+ unsigned MaxVPressure = 0;
+ RematStatus Status;
+ unsigned TgtOcc =
+ collectFnPressure(MF, LIS, MRI, ST, MaxVPressure, MaxSPressure, Status);
+ const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+ if (TgtOcc >= MaxOcc) {
+ Status.TargetOcc = TgtOcc;
+ Status.TargetVLimit = 0;
+ Status.TargetSLimit = 0;
+ Status.MaxVPressure = 0;
+ Status.MaxSPressure = 0;
+ Status.InputPhysicalVPressure = 0;
+ Status.InputPhysicalSPressure = 0;
+ Status.MemBound = false;
+ Status.NotBalance = false;
+ return Status;
+ }
+
+ MaxSPressure += RegForVCC;
+ MaxVPressure = std::min(MaxVPressure, ST->getMaxNumVGPRs(MF));
+ unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(MaxSPressure);
+ unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(MaxVPressure);
+
+ llvm::SchedScore TotalScore = llvm::collectLatency(MF, *ST, MLI);
+ bool MemBound =
+ TotalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc);
+
+ bool NotBalance = false;
+
+ const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU();
+ // Currently, only the SGPR-bound case can be fixed with remat.
+ if (STgtOcc < VTgtOcc) {
+ unsigned BigOcc = std::max(STgtOcc, VTgtOcc);
+ // Raise TgtOcc when SGPR and VGPR occupancy are not balanced.
+ if (BigOcc > TgtOcc) {
+ TgtOcc = BigOcc;
+ NotBalance = true;
+ if (TgtOcc >= MaxOccupancy)
+ TgtOcc = MaxOccupancy - 1;
+ }
+ }
+
+ // Collect input physical pressure.
+ const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+ unsigned VInputPressure = 0;
+ uint64_t SInputMask = 0;
+ for (const auto &Livein : MRI.liveins()) {
+ const Register Reg = Livein.first;
+ const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+ assert(Reg.isPhysical() && "input must be physical reg");
+ unsigned RegSize = RC->getLaneMask().getNumLanes();
+ if (SIRI->isVGPR(MRI, Reg)) {
+ VInputPressure += RegSize;
+ } else {
+ unsigned RegIndex = SIRI->getHWRegIndex(Reg);
+ uint64_t Mask = ((1 << RegSize) - 1) << RegIndex;
+ SInputMask |= Mask;
+ }
+ }
+ // SGPRs need to be aligned to 4 for the 4-dword/8-dword descriptors, which
+ // causes high pressure.
+ unsigned SInputPressure = 0;
+ uint64_t Mask = 0xf;
+ while (Mask != 0) {
+ if (Mask & SInputMask) {
+ SInputPressure += 4;
+ }
+ Mask = Mask << 4;
+ }
+
+ // If balanced, try next occupancy.
+ TgtOcc = NotBalance ? TgtOcc : (TgtOcc + 1);
+
+ auto CC = MF.getFunction().getCallingConv();
+ bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS;
+ // For shader profiles other than PS/CS, cap the target occupancy at 4.
+ if (!IsPsCs) {
+ TgtOcc = TgtOcc > 4 ? 4 : TgtOcc;
+ }
+ if (TargetOccupancy)
+ TgtOcc = TargetOccupancy;
+
+ unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true);
+ unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc);
+
+ Status.TargetOcc = TgtOcc;
+ Status.TargetVLimit = VLimit;
+ Status.TargetSLimit = SLimit;
+ Status.MaxVPressure = MaxVPressure;
+ Status.MaxSPressure = MaxSPressure;
+ Status.InputPhysicalVPressure = VInputPressure;
+ Status.InputPhysicalSPressure = SInputPressure;
+ Status.MemBound = MemBound;
+ Status.NotBalance = NotBalance;
+ return Status;
+}
+
+// For a case like
+// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0,
+// implicit-def dead $scc; xb.uniform
+// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc;
+// xb.uniform %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit
+// killed $scc; xb.uniform
+// Sinking the S_AND right before the S_CSELECT would overwrite SCC.
+// To avoid this, skip the case where DefMI has an implicit def used by UseMI.
+bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) {
+ if (DefMI->getDesc().NumImplicitDefs == 0)
+ return false;
+
+ auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo();
+ for (MachineOperand &Def : DefMI->implicit_operands()) {
+ if (!Def.isReg())
+ continue;
+ if (Def.isUse())
+ continue;
+ Register Reg = Def.getReg();
+ if (UseMI->readsRegister(Reg, TRI))
+ return true;
+ }
+ return false;
+}
+
+// SGPRs have alignment requirements, so we cannot get an accurate reg count.
+const unsigned NearTargetRegLimit = 10;
+bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST,
+ MachineFunction &MF) {
+ unsigned MaxSGPR = ST->getAddressableNumSGPRs();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ Register ScratchRSrcReg = MFI->getScratchRSrcReg();
+ if (ScratchRSrcReg)
+ MaxSGPR -= 4;
+
+ const unsigned AlignmentDelta = 3;
+ MaxSGPR -= AlignmentDelta;
+
+ return MaxSPressure > MaxSGPR;
+}
+
+// Skip live regs that have been rematerialized to other blocks.
+void updateLiveInfo(MapVector<Register, RematNode> &RematMap,
+ GCNRPTracker::LiveRegSet &LiveSet,
+ const GCNRPTracker::LiveRegSet &InputLive,
+ MachineBasicBlock *CurBB,
+ DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+ for (auto &It : RematMap) {
+ unsigned Reg = It.first;
+ // Skip reg not in live set.
+ if (!LiveSet.count(Reg))
+ continue;
+ // Skip regs already in the input set.
+ // The input set is taken care of in getReducedSize.
+ if (InputLive.count(Reg))
+ continue;
+
+ auto &Node = It.second;
+ if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+ MachineBasicBlock *InsertBB = Node.InsertBlock;
+ // If LiveInfo.BB is after InsertBB in reverse post order, the def is
+ // still before LiveInfo.BB, so it is still live.
+ unsigned LiveBBIndex = RPOTIndexMap[CurBB];
+ unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
+ if (LiveBBIndex > InsertBBIndex) {
+ continue;
+ }
+ }
+ // Already in the remat map; no need to check it again, so remove it from
+ // the candidates.
+ LiveSet.erase(Reg);
+ }
+}
+
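+// Estimate the pressure gain (in bits) of rematerializing Reg's def into the
+// hot region. Returns 0 when the def reads physical registers, VCC, mixes
+// SGPR and VGPR operands, or its inputs are not uniquely defined.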
+int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, bool IsVGPR) {
+ int RematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
+ for (MachineOperand &MO : DefMI->operands()) {
+ if (MO.isImm())
+ continue;
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+ if (MO.isTied())
+ continue;
+
+ if (MO.getReg() == AMDGPU::EXEC)
+ continue;
+
+ // Don't move user of VCC.
+ if (MO.getReg() == AMDGPU::VCC) {
+ RematSize = 0;
+ break;
+ }
+ Register Reg = MO.getReg();
+
+ // Don't move physical register use.
+ if (Reg.isPhysical()) {
+ RematSize = 0;
+ break;
+ }
+
+ if (IsVGPR != SIRI->isVGPR(MRI, Reg)) {
+ // Mixing VGPR and SGPR operands is not supported for remat yet.
+ // TODO: count possible pressure change here.
+ RematSize = 0;
+ break;
+ }
+ bool IsSingleDef = MRI.hasOneDef(Reg);
+ if (!IsSingleDef) {
+ IsSingleDef = llvm::isSub0Sub1SingleDef(Reg, MRI);
+ }
+
+ if (IsSingleDef) {
+ // The reg might be shared with other candidates; check it here.
+ // Shared regs are counted in getSharedReducedSize.
+ const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
+ if (unsigned SubIdx = MO.getSubReg()) {
+ if (OpRC)
+ OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
+ }
+ int InputSize = SIRI->getRegSizeInBits(*OpRC);
+ // If the input is not live in the hotspot, moving it across the hotspot
+ // should take fewer registers than DefMI.
+ if (RematSize > InputSize) {
+ RematSize -= InputSize;
+ continue;
+ }
+ }
+
+ RematSize = 0;
+ break;
+ }
+ return RematSize;
+}
+
+MachineBasicBlock *findNonLoopDominator(MachineBasicBlock *BB,
+ MachineDominatorTree *DT,
+ MachineLoopInfo *LI) {
+ while (LI->getLoopDepth(BB) > 0) {
+ MachineDomTreeNode *N = DT->getNode(BB);
+ if (N == nullptr)
+ return nullptr;
+ MachineDomTreeNode *IDom = N->getIDom();
+ if (IDom == nullptr)
+ return nullptr;
+
+ BB = IDom->getBlock();
+ }
+
+ return BB;
+}
+
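+// Find the nearest common dominator of all blocks in Blocks; see the comment
+// below for the split-block special case.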
+MachineBasicBlock *nearestCommonDominator(MachineDominatorTree *DT,
+ BlockSet &Blocks) {
+ auto I = Blocks.begin(), E = Blocks.end();
+
+ MachineBasicBlock *DomB = cast<MachineBasicBlock>(*(I++));
+ while (I != E) {
+ MachineBasicBlock *B = cast<MachineBasicBlock>(*(I++));
+ DomB = DT->findNearestCommonDominator(DomB, B);
+ if (DomB == nullptr)
+ return nullptr;
+ }
+ // For split block like:
+ // bb.42:
+ // %632.sub2:vreg_128 = V_MOV_B32_e32 %717.sub2:vreg_128, implicit $exec,
+ // // implicit $exec
+ // %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec,
+ // implicitdef $scc, implicit $exec
+ //
+ // bb.68:
+ //; predecessors: %bb.42
+ // successors: %bb.45(0x40000000), %bb.43(0x40000000); %bb.45(50.00%),
+ // %bb.43(50.00%)
+ //
+ // SI_MASK_BRANCH %bb.43, implicit $exec
+ // S_BRANCH %bb.45
+ // which is from
+ // bb.42:
+ //%129:vgpr_32 = V_MOV_B32_e32 killed %548:vgpr_32, implicit $exec, implicit
+ //$exec %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec,
+ // SI_MASK_BRANCH %bb.43, implicit $exec
+ // S_BRANCH %bb.45
+ // The real common dom is bb.42.
+ // TODO: use _term version of exec update instructions so don't need this
+ // anymore.
+ if (DomB && DomB->pred_size() == 1 && !DomB->empty()) {
+ // Upstreaming note: This used to be SI_MASK_BRANCH
+ if (DomB->begin()->getOpcode() == AMDGPU::S_CBRANCH_EXECZ) {
+ MachineBasicBlock *Pred = *DomB->pred_begin();
+ if (Pred->succ_size() == 1 &&
+ (Pred->empty() || !Pred->back().isBranch())) {
+ DomB = Pred;
+ }
+ }
+ }
+
+ return DomB;
+}
+
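+// Choose the block to rematerialize Reg's def into: the nearest common
+// dominator of all use blocks, hoisted out of loops unless the function is
+// memory bound, and only if it is reachable from DefMI's block.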
+MachineBasicBlock *
+findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
+ const MachineRegisterInfo &MRI, bool MemBound) {
+
+ BlockSet BBSet;
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ BBSet.insert(UseMI.getParent());
+ }
+ if (BBSet.size() == 0)
+ return nullptr;
+
+ MachineBasicBlock *BB = *BBSet.begin();
+ if (BBSet.size() > 1) {
+ MachineBasicBlock *BDom = nearestCommonDominator(DT, BBSet);
+ if (!BDom)
+ return nullptr;
+ BB = BDom;
+ }
+ // Try to find a non-loop dominator.
+ if (!MemBound) {
+ BB = findNonLoopDominator(BB, DT, MLI);
+ }
+ if (!BB)
+ return nullptr;
+
+ // If BB is already a hot block, moving to BB will not help;
+ // hotBlockRemat will reject it when processing BB.
+
+ // Must be reachable from DefMI.
+ if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB))
+ return nullptr;
+
+ return BB;
+}
+
+// May be expensive to call all over the place.
+bool isUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
+ for (auto &Def : DefMI->defs()) {
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Def.getReg())) {
+ if (UseMI.isPHI())
+ return true;
+ }
+ }
+ return false;
+}
+
+bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
+ // Do not move PHI nodes
+ if (isUsedByPhi(DefMI, MRI))
+ return false;
+
+ unsigned OpNum = DefMI->getNumOperands();
+ // Only move a DefMI whose operands are all uniquely defined.
+ for (unsigned I = 0; I < OpNum; I++) {
+ MachineOperand &Op = DefMI->getOperand(I);
+ if (!Op.isReg())
+ continue;
+ if (!MRI.getUniqueVRegDef(Op.getReg()) &&
+ !llvm::isSub0Sub1SingleDef(Op.getReg(), MRI)) {
+ return false;
+ }
+ }
+ return true;
+}
+
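+// Record a one-def-one-use candidate: pick the insert block/point for the
+// single user and mark the node as OneDefOneUse. VGPR defs are only moved
+// next to a user in a different block.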
+void addOneDefOneUseCandidate(RematNode &Node,
+ std::vector<RematNode> &RematList,
+ MachineRegisterInfo &MRI, int &RematCnt,
+ MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT,
+ MachineLoopInfo *MLI, bool IsVGPR,
+ bool MemBound) {
+ unsigned Reg = Node.Reg;
+ MachineInstr *DefMI = Node.DefMI;
+
+ unsigned Size = Node.Size;
+ MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin();
+ MachineBasicBlock *InsertBB = UseMI->getParent();
+
+ // For VGPRs, always move next to the only user to avoid WQM or exec issues.
+ // But doing this causes problems when DefMI is in WQM and the user is not,
+ // so VGPR remat is disabled for now.
+ // TODO: make sure the single user doesn't need WQM.
+ if (!IsVGPR) {
+ if (MachineBasicBlock *NewInsertBB =
+ findInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, MemBound)) {
+ if (InsertBB != NewInsertBB) {
+ InsertBB = NewInsertBB;
+ // If a non-loop insert block can be found, use it.
+ if (DefMI->getParent() != InsertBB) {
+ if (!InsertBB->empty()) {
+ auto It = InsertBB->getFirstNonPHI();
+ It = skipDebugInstructionsForward(It, InsertBB->end());
+ if (It == InsertBB->end())
+ UseMI = nullptr;
+ else
+ UseMI = &*It;
+ }
+ }
+ }
+ }
+ }
+
+ if (IsVGPR) {
+ // Don't count regs in the same block for VALU.
+ if (UseMI->getParent() == DefMI->getParent())
+ return;
+ }
+
+ // Skip the case where DefMI has an implicit def that is used by UseMI.
+ if (isImplicitDefUse(DefMI, UseMI)) {
+ return;
+ }
+
+ Node.InsertBlock = InsertBB;
+ Node.InsertPointMI = UseMI;
+ Node.Kind = RematNode::RematKind::OneDefOneUse;
+ RematList.emplace_back(Node);
+ RematCnt += Size;
+}
+
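+// Collect remat candidates from the live set: registers with a single, safe
+// def and a positive remat gain. Unsafe registers go into PinnedRegSet and
+// the result is sorted by size.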
+void buildRematCandiates(std::vector<RematNode> &Candidates,
+ GCNRPTracker::LiveRegSet &CandidateRegSet,
+ DenseSet<unsigned> &PinnedRegSet,
+ const MachineRegisterInfo &MRI,
+ const SIInstrInfo *SIII, const SIRegisterInfo *SIRI,
+ bool IsVGPR) {
+
+ for (auto LiveRegIt : CandidateRegSet) {
+ unsigned Reg = LiveRegIt.first;
+ // Skip unsafe reg.
+ if (PinnedRegSet.count(Reg))
+ continue;
+
+ if (SIRI->isVGPR(MRI, Reg) != IsVGPR)
+ continue;
+ bool IsSafeCandidate = true;
+ MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+ if (MI) {
+ if (IsVGPR) {
+ // Only remat VALU for now.
+ if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY)
+ IsSafeCandidate = false;
+ if (MI->getOpcode() == AMDGPU::COPY) {
+ // Make sure the source has a unique def.
+ if (MI->getOperand(1).isReg() &&
+ nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg()))
+ IsSafeCandidate = false;
+ } else {
+ // Skip convergent VALU instructions.
+ if (MI->isConvergent())
+ IsSafeCandidate = false;
+ }
+ }
+ // Skip instructions with more than one def.
+ if (MI->getDesc().NumDefs > 1)
+ IsSafeCandidate = false;
+ } else {
+ IsSafeCandidate = false;
+ }
+
+ if (IsSafeCandidate) {
+ int Gain = rematGain(MI, Reg, MRI, SIRI, IsVGPR);
+ if (Gain > 0) {
+ Candidates.emplace_back(RematNode(Reg, MI, Gain >> 5));
+ } else {
+ IsSafeCandidate = false;
+ }
+ }
+ // Save unsafe reg.
+ if (!IsSafeCandidate)
+ PinnedRegSet.insert(Reg);
+ }
+
+ // Sort by gain.
+ std::sort(Candidates.begin(), Candidates.end(),
+ [](RematNode &I, RematNode &J) { return I.Size > J.Size; });
+}
+
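+// Turn multi-use candidates into Clone nodes. Registers whose users all live
+// in the def's own block are pinned instead, and clone candidates are sorted
+// by the number of user blocks.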
+void addCloneCandidate(std::vector<RematNode *> &CloneList,
+ std::vector<RematNode> &RematList,
+ DenseSet<unsigned> &PinnedRegSet,
+ MachineRegisterInfo &MRI, int &RematCnt) {
+ // Group users by block.
+ std::vector<BlockSet> UserSetList(CloneList.size());
+
+ for (size_t I = 0; I < CloneList.size(); I++) {
+ auto *Node = CloneList[I];
+ unsigned Reg = Node->Reg;
+ MachineInstr *DefMI = Node->DefMI;
+ // Group users by block.
+ BlockSet &UserSet = UserSetList[I];
+
+ for (auto UseIt = MRI.use_instr_nodbg_begin(Reg);
+ UseIt != MRI.use_instr_nodbg_end();) {
+ MachineInstr &UseMI = *(UseIt++);
+ UserSet.insert(UseMI.getParent());
+ }
+
+ if (UserSet.size() == 1) {
+ // All users are in the same block as DefMI.
+ if (*UserSet.begin() == DefMI->getParent()) {
+ // Mark it as cannot-remat for now.
+ // TODO: try to split if it is bigger than 4 and only used once per
+ // channel.
+ PinnedRegSet.insert(Reg);
+ continue;
+ }
+ }
+
+ int Size = Node->Size;
+ Size <<= 16;
+ // Pack the userSet size into Size.
+ Size |= UserSet.size();
+ Node->UserCount = Size;
+ }
+
+ std::sort(CloneList.begin(), CloneList.end(),
+ // Sort based on userSet size.
+ [](const RematNode *A, const RematNode *B) {
+ static constexpr int Mask = 0xffff;
+ return (A->UserCount & Mask) < (B->UserCount & Mask);
+ });
+
+ for (RematNode *Node : CloneList) {
+ Node->Kind = RematNode::RematKind::Clone;
+ RematList.emplace_back(*Node);
+ RematCnt += Node->Size;
+ }
+}
+
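+// Split candidates into one-def-one-use nodes and, for SGPRs, clone nodes;
+// anything not safe to move is pinned. Returns the total size added to
+// RematList.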
+int filterRematCandiates(std::vector<RematNode> &Candidates,
+ std::vector<RematNode> &RematList,
+ DenseSet<unsigned> &PinnedRegSet,
+ MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
+ MachineRegisterInfo &MRI, bool IsVGPR, bool MemBound) {
+ int RematCnt = 0;
+ // Handle the one-def-one-use case first.
+ for (auto &Node : Candidates) {
+ unsigned Reg = Node.Reg;
+ if (!MRI.hasOneNonDBGUse(Reg)) {
+ continue;
+ }
+ MachineInstr *DefMI = Node.DefMI;
+ if (!isSafeToMove(DefMI, MRI)) {
+ PinnedRegSet.insert(Reg);
+ continue;
+ }
+
+ addOneDefOneUseCandidate(Node, RematList, MRI, RematCnt, DT, PDT, MLI,
+ IsVGPR, MemBound);
+ }
+
+ if (!IsVGPR) {
+ std::vector<RematNode *> CloneList;
+ // Try the multi-use case.
+ for (auto &Node : Candidates) {
+ unsigned Reg = Node.Reg;
+ if (MRI.hasOneNonDBGUse(Reg)) {
+ continue;
+ }
+ MachineInstr *DefMI = Node.DefMI;
+ if (!isSafeToMove(DefMI, MRI)) {
+ PinnedRegSet.insert(Reg);
+ continue;
+ }
+
+ // Clone for each user.
+ CloneList.emplace_back(&Node);
+ }
+
+ addCloneCandidate(CloneList, RematList, PinnedRegSet, MRI, RematCnt);
+ }
+
+ return RematCnt;
+}
+
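+// Sum the pressure removed from this block's candidate live set by nodes
+// already in RematMap, and record the corresponding def instructions in
+// ReducedInsts.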
+int getReducedSize(MapVector<Register, RematNode> &RematMap,
+ GCNRPTracker::LiveRegSet &CanidateSet, InstSet &ReducedInsts,
+ const MachineRegisterInfo &MRI, BlockLiveInfo &LiveInfo,
+ DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+ int ReducedSize = 0;
+ for (auto &It : RematMap) {
+ Register Reg = It.first;
+
+ if (!CanidateSet.count(Reg))
+ continue;
+
+ bool IsReduced = false;
+ auto &Node = It.second;
+ if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+ MachineBasicBlock *InsertBB = Node.InsertBlock;
+ // If LiveInfo.BB is before InsertBB in reverse post order, the def is
+ // moved after LiveInfo.BB, so it is not live anymore.
+ unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB];
+ unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
+ if (LiveBBIndex < InsertBBIndex)
+ IsReduced = true;
+ } else {
+ // Clone.
+ IsReduced = true;
+ // If there is a use in LiveInfo.BB, it cannot be reduced from the input live set.
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ if (UseMI.getParent() == LiveInfo.BB) {
+ IsReduced = false;
+ break;
+ }
+ }
+ }
+ if (IsReduced) {
+ ReducedSize += Node.Size;
+ ReducedInsts.insert(Node.DefMI);
+ }
+
+ // Already in the remat map; no need to check it again, so remove it from the candidates.
+ CanidateSet.erase(Reg);
+ }
+
+ return ReducedSize;
+}
+
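+// Count operand lanes shared between the reduced def instructions; shared
+// inputs are only live once, so their size is credited back when checking
+// whether the target is reached.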
+int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) {
+
+ // Find shared operand in ReducedInsts.
+ int SharedSize = 0;
+ DenseMap<unsigned, LaneBitmask> SharedRegMaskMap;
+ for (MachineInstr *DefMI : ReducedInsts) {
+ for (MachineOperand &MO : DefMI->operands()) {
+ if (MO.isImm())
+ continue;
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+ if (MO.isTied())
+ continue;
+ Register Reg = MO.getReg();
+
+ if (Reg == AMDGPU::EXEC)
+ continue;
+ if (!Reg.isVirtual())
+ continue;
+
+ if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg())) {
+ // Mixing VGPR and SGPR operands is not supported for remat yet.
+ continue;
+ }
+
+ const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
+ int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
+ unsigned Mask;
+ if (unsigned SubIdx = MO.getSubReg()) {
+ OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
+ int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
+ Mask = (1 << SubMOSize) - 1;
+ } else {
+ Mask = (1 << MOSize) - 1;
+ }
+ auto SharedRegIt = SharedRegMaskMap.find(Reg);
+ if (SharedRegIt == SharedRegMaskMap.end()) {
+ SharedRegMaskMap[Reg] = LaneBitmask(Mask);
+ } else {
+ unsigned PrevMask = SharedRegIt->second.getAsInteger();
+ if (unsigned SharedMask = (PrevMask & Mask)) {
+ // Something is shared.
+ for (int I = 0; I < MOSize; I++) {
+ if (SharedMask & (1 << I)) {
+ SharedSize += 1;
+ }
+ }
+ }
+ LaneBitmask MoMask = LaneBitmask(Mask | PrevMask);
+ SharedRegMaskMap[Reg] = MoMask;
+ }
+ }
+ }
+ return SharedSize;
+}
+
+void dumpRematMap(MapVector<Register, RematNode> &RematMap,
+ const SIRegisterInfo *SIRI) {
+ dbgs() << "\n rematMap: \n";
+ for (auto It : RematMap) {
+ int Reg = It.first;
+ dbgs() << printReg(Reg, SIRI);
+ dbgs() << "\n";
+ }
+}
+int DebugBlockIndex = 42;
+void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet,
+ MapVector<Register, RematNode> &VRematMap,
+ MapVector<Register, RematNode> &SRematMap, int BlockIndex,
+ const SIRegisterInfo *SIRI) {
+ if (DebugBlockIndex != BlockIndex)
+ return;
+ llvm::dumpLiveSet(LiveSet, SIRI);
+ dumpRematMap(VRematMap, SIRI);
+ dumpRematMap(SRematMap, SIRI);
+}
+
+void dumpCandidates(std::vector<RematNode> &RematCandidates, int BlockIndex,
+ const SIRegisterInfo *SIRI) {
+ if (DebugBlockIndex != BlockIndex)
+ return;
+ dbgs() << "\n Candidates: \n";
+ unsigned TotalSize = 0;
+ for (RematNode &Node : RematCandidates) {
+ dbgs() << printReg(Node.Reg, SIRI) << " size:" << Node.Size;
+ dbgs() << "\n";
+ TotalSize += Node.Size;
+ }
+ dbgs() << "Total Size:" << TotalSize << "\n";
+}
+
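+// Main driver: walk the blocks in post order, track per-instruction pressure
+// in blocks that exceed the target limits, and greedily pick SGPR remat
+// candidates (one-def-one-use first, then clones) until the limits are met.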
+bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
+ LiveIntervals *LIS, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, bool &IsNearTarget) {
+ const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+
+ const SIInstrInfo *SIII = ST->getInstrInfo();
+ const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ DenseMap<MachineBasicBlock *, unsigned> RPOTIndexMap;
+ for (MachineBasicBlock *MBB : RPOT) {
+ RPOTIndexMap[MBB] = RPOTIndexMap.size();
+ }
+
+ auto &MRI = MF.getRegInfo();
+
+ bool IsUpdated = false;
+ RematStatus Status = getRematStatus(MF, MLI, LIS, MRI, ST);
+
+ const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+ if (Status.TargetOcc >= MaxOcc)
+ return false;
+
+ unsigned VLimit = Status.TargetVLimit;
+ unsigned SLimit = Status.TargetSLimit;
+
+ int RematSCnt = Status.MaxSPressure - SLimit;
+
+ bool IsSGPRSpill = false;
+ if (RematSCnt > 0) {
+ IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
+ }
+
+ const bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
+
+ // If bound by LDS, skip.
+ if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second &&
+ !IsForceRematSgpr)
+ return false;
+
+ MachineBasicBlock *EntryMBB = &MF.front();
+
+ auto *SlotIndexes = LIS->getSlotIndexes();
+
+ // Regs already marked for remat.
+ MapVector<Register, RematNode> VRematMap;
+ MapVector<Register, RematNode> SRematMap;
+ // Regs which cannot be moved around for remat.
+ DenseSet<unsigned> PinnedRegSet;
+ std::vector<BlockLiveInfo> HotBlocks;
+ for (auto It = po_begin(EntryMBB); It != po_end(EntryMBB); It++) {
+ MachineBasicBlock *MBB = *It;
+ auto &RP = Status.MBBPressureMap[MBB];
+ // Ignore blocks that are not hot.
+ if (RP.getVGPRNum(ST->hasGFX90AInsts()) < Status.TargetVLimit &&
+ (RP.getMaxSGPR() + RegForVCC + Status.InputPhysicalSPressure) <
+ Status.TargetSLimit)
+ continue;
+ // Collect reg pressure.
+ unsigned MaxVPressure = 0;
+ unsigned MaxSPressure = 0;
+ const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB];
+
+ const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB];
+ LLVM_DEBUG(
+ dumpHotBlock(InputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI));
+
+ GCNDownwardRPTracker Tracker(*LIS);
+
+ Tracker.reset(*MBB->begin(), &InputLive);
+
+ for (MachineInstr &MI : *MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ Tracker.advance();
+ auto LISLR = Tracker.getLiveRegs();
+ // Update live set for things already remated.
+ updateLiveInfo(VRematMap, LISLR, InputLive, MBB, RPOTIndexMap);
+ updateLiveInfo(SRematMap, LISLR, InputLive, MBB, RPOTIndexMap);
+
+ const GCNRPTracker::LiveRegSet &LiveSet = LISLR;
+ unsigned VPressure = 0;
+ unsigned SPressure = 0;
+ collectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure);
+ if (MaxVPressure < VPressure)
+ MaxVPressure = VPressure;
+ if (MaxSPressure < SPressure)
+ MaxSPressure = SPressure;
+ }
+ MaxSPressure += RegForVCC + Status.InputPhysicalSPressure;
+ if (MaxVPressure <= VLimit && MaxSPressure <= SLimit)
+ continue;
+
+ // Build block live info.
+ // Use outputLive for EntryMBB.
+ BlockLiveInfo LiveInfo = {MBB, MaxSPressure, MaxVPressure,
+ MBB != EntryMBB ? InputLive : OutputLive};
+ // Skip the entry block when saving hot blocks to reduce cloning, because
+ // we do not clone in the entry block.
+ if (MBB != EntryMBB)
+ HotBlocks.emplace_back(LiveInfo);
+ GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.InputLive;
+
+ // Update reg pressure based on remat list.
+ InstSet VReducedInsts;
+ InstSet SReducedInsts;
+ int VReduced = getReducedSize(VRematMap, CandidateRegs, VReducedInsts, MRI,
+ LiveInfo, RPOTIndexMap);
+ int SReduced = getReducedSize(SRematMap, CandidateRegs, SReducedInsts, MRI,
+ LiveInfo, RPOTIndexMap);
+
+ // Calculate the size that needs to be rematerialized.
+ int RematVCnt = MaxVPressure - VReduced - VLimit;
+ int RematSCnt = MaxSPressure - SReduced - SLimit;
+
+ bool IsSGPRSpill = false;
+ if (RematSCnt > 0) {
+ IsSGPRSpill = nearSgprSpill(MaxSPressure, ST, MF);
+ }
+ bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
+ // Try to add candidates into remat list.
+
+ int NewRematSCnt = 0;
+ if (RematSCnt > 0) {
+ // Build candidate nodes.
+ std::vector<RematNode> SRematCandidates;
+ buildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI,
+ SIII, SIRI, /*IsVGPR*/ false);
+
+ LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI));
+ std::vector<RematNode> SRematList;
+ // Filter candidates.
+ NewRematSCnt = filterRematCandiates(SRematCandidates, SRematList,
+ PinnedRegSet, DT, PDT, MLI, MRI,
+ /*IsVGPR*/ false, Status.MemBound);
+ if (NewRematSCnt > RematSCnt) {
+ // There are enough remat nodes to cover RematSCnt.
+ int RematCnt = 0;
+ for (RematNode &Node : SRematList) {
+ SRematMap[Node.Reg] = Node;
+ RematCnt += Node.Size;
+ if (RematCnt > RematSCnt)
+ break;
+ }
+ NewRematSCnt = 0;
+ } else {
+
+ for (RematNode &Node : SRematList) {
+ SReducedInsts.insert(Node.DefMI);
+ }
+ // Check shared size.
+ int SharedReducedSize =
+ getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI);
+ if (((NewRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >=
+ RematSCnt) {
+ for (RematNode &Node : SRematList) {
+ SRematMap[Node.Reg] = Node;
+ }
+ } else {
+ if (!IsForceRematSgpr)
+ return false;
+ for (RematNode &Node : SRematList) {
+ SRematMap[Node.Reg] = Node;
+ }
+ // Find local one-def-one-use candidates.
+ for (MachineInstr &MI : *MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ if (MI.getDesc().NumDefs != 1)
+ continue;
+ MachineOperand &DstMO = MI.getOperand(0);
+ Register Reg = DstMO.getReg();
+ if (!SIRI->isSGPRReg(MRI, Reg))
+ continue;
+ if (!MRI.hasOneNonDBGUse(Reg))
+ continue;
+ if (!MRI.hasOneDef(Reg))
+ continue;
+ if (Reg.isPhysical())
+ continue;
+ MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg);
+ if (UseMI.getParent() != MBB)
+ continue;
+ int Gain = rematGain(&MI, Reg, MRI, SIRI,
+ /*IsVGPR*/ false);
+ if (Gain > 0) {
+ // Skip the case where DefMI has an implicit def that is used by UseMI.
+ if (isImplicitDefUse(&MI, &UseMI)) {
+ continue;
+ }
+ RematNode Node = {Reg, &MI, (unsigned)Gain >> 5};
+ Node.InsertPointMI = &UseMI;
+ Node.Kind = RematNode::RematKind::OneDefOneUse;
+ SRematMap[Reg] = Node;
+ SharedReducedSize += Node.Size;
+ }
+ }
+ }
+ NewRematSCnt = RematSCnt - NewRematSCnt - SharedReducedSize;
+ }
+ }
+ // If this works, continue.
+
+ // Collect live ranges from hot instructions.
+ // Find the common live ranges in the hot instructions.
+ // Remat these common live ranges.
+ // Apply the remat.
+
+ int NewRematVCnt = 0;
+ if (RematVCnt > 0) {
+ // TODO: V remat.
+ }
+
+ bool NeedSRemat = RematSCnt > 0;
+ bool NeedVRemat = RematVCnt > 0;
+ // If there is an SGPR spill, always do remat.
+ bool IsSRematOK =
+ (NewRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr;
+ bool IsVRematOK =
+ (Status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty();
+ if (NeedSRemat && NeedVRemat) {
+ if (IsVRematOK && IsSRematOK) {
+ IsUpdated = true;
+ } else if (IsSGPRSpill) {
+ IsUpdated = true;
+ }
+ } else if (NeedSRemat) {
+ if (IsSRematOK) {
+ IsUpdated = true;
+ }
+ } else if (NeedVRemat) {
+ if (IsVRematOK) {
+ IsUpdated = true;
+ }
+ }
+ // TODO: what to do when the target cannot be reached?
+ if (NewRematSCnt > 0) {
+ if ((unsigned)NewRematSCnt <= NearTargetRegLimit) {
+ IsNearTarget = true;
+ } else {
+ if (!IsSGPRSpill)
+ return false;
+ }
+ }
+ }
+
+ if (SRematMap.empty() && VRematMap.empty()) {
+ return IsUpdated;
+ }
+
+ if (!SRematMap.empty()) {
+ IsUpdated = true;
+ applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII,
+ MF);
+ LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs()););
+ }
+
+ // Balance between vector and scalar if possible.
+ return IsUpdated;
+}
+
+bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.size() < 2)
+ return false;
+ LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+ MachineDominatorTree *DT =
+ &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ MachinePostDominatorTree *PDT =
+ &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+ MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+
+ bool IsNearTarget = false;
+ return hotBlockRemat(MF, MLI, LIS, DT, PDT, IsNearTarget);
+}
+
+} // namespace
+
+INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
+ "AMDGPU rematerialize", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
+ "AMDGPU rematerialize", false, false)
+
+char AMDGPUHotBlockRematerialize::ID = 0;
+char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
+
+FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() {
+ return new AMDGPUHotBlockRematerialize();
+}
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
new file mode 100644
index 0000000000000..dc8b67e368516
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -0,0 +1,217 @@
+//===------- AMDGPUMIRUtils.cpp - Helpers for MIR passes ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for MIR passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMIRUtils.h"
+#include "SIRegisterInfo.h"
+#include "SIInstrInfo.h"
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+
+#define DEBUG_TYPE "xb-mir-util"
+using namespace llvm;
+
+namespace llvm {
+bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd,
+ MachineBasicBlock &MBB) {
+ // BBEnd doesn't point to the boundary instruction.
+ // Skip debug instructions.
+ while (BBEnd != MBB.rend() && BBEnd->isDebugInstr())
+ BBEnd++;
+ return BBEnd != MBB.rend();
+}
+} // namespace llvm
+
+namespace {
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes,
+ SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+ MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
+ MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
+ // Treat non-instruction boundaries as not local.
+ if (!StartMI || !EndMI)
+ return false;
+ // It is local when the parent MBBs are the same.
+ bool IsSameMBB = StartMI->getParent() == EndMI->getParent();
+ if (!IsSameMBB)
+ return false;
+ // Collect touched MBB.
+ MachineBasicBlock *MBB = StartMI->getParent();
+ TouchedMBBSet.insert(MBB);
+ return true;
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes,
+ SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+ for (const LiveRange::Segment &Seg : Range->segments) {
+ if (!isLocalSegment(&Seg, Indexes, TouchedMBBSet))
+ return false;
+ }
+ return true;
+}
+
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) {
+ MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
+ MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
+ // Treat non-instruction boundaries as not local.
+ if (!StartMI || !EndMI)
+ return false;
+ // It is local when the parent MBBs are the same.
+ return StartMI->getParent() == EndMI->getParent();
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) {
+ for (const LiveRange::Segment &Seg : Range->segments) {
+ if (!isLocalSegment(&Seg, Indexes))
+ return false;
+ }
+ return true;
+}
+
+} // namespace
+
+// In a case like float4 v, where v.x is used and defined in one block and v.y
+// is used and defined in another block, one live interval can touch more than
+// one MBB. TouchedMBBSet is used for scheduling, where a local live interval
+// can cross multiple regions and the live registers must be computed for each
+// region inside the touched MBBs.
+bool llvm::isLocalLiveInterval(
+ const LiveInterval &LI, SlotIndexes *Indexes,
+ SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges()) {
+ if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
+ return false;
+ }
+ }
+ return isLocalLiveRange(&LI, Indexes, TouchedMBBSet);
+}
+
+bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges()) {
+ if (!isLocalLiveRange(&S, Indexes))
+ return false;
+ }
+ }
+ return isLocalLiveRange(&LI, Indexes);
+}
+
+void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
+
+ dbgs() << "\n live set: \n";
+ for (auto It : LiveSet) {
+ int Reg = It.first;
+ dbgs() << printReg(Reg, SIRI);
+ if (It.second.any()) {
+ dbgs() << " mask:" << It.second.getAsInteger();
+ }
+ dbgs() << "\n";
+ }
+}
+
+namespace llvm {
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI) {
+ unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
+ Size >>= 5;
+ if (Mask.any()) {
+ if (unsigned MaskSize = Mask.getNumLanes()) {
+ if (MaskSize < Size)
+ Size = MaskSize;
+ }
+ }
+ return Size;
+}
+
+void collectLiveSetPressure(const LiveSet &LiveSet,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, unsigned &VPressure,
+ unsigned &SPressure) {
+ VPressure = 0;
+ SPressure = 0;
+ for (auto LiveIt : LiveSet) {
+ unsigned Reg = LiveIt.first;
+ unsigned Size = getRegSize(Reg, LiveIt.second, MRI, SIRI);
+ if (SIRI->isVGPR(MRI, Reg)) {
+ VPressure += Size;
+ } else {
+ SPressure += Size;
+ }
+ }
+}
+
+bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+ // Support multiple defs for the pointer pattern:
+ // undef_ %808.sub0:sgpr_64 = COPY killed %795:sgpr_32
+ // %808.sub1:sgpr_64 = S_MOV_B32 0
+ bool HasSub0 = false;
+ bool HasSub1 = false;
+ for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) {
+ if (unsigned SubReg = UserDefMO.getSubReg()) {
+ bool IsSingleSubReg = false;
+ switch (SubReg) {
+ default:
+ break;
+ case AMDGPU::sub0:
+ if (!HasSub0) {
+ HasSub0 = true;
+ IsSingleSubReg = true;
+ }
+ break;
+ case AMDGPU::sub1:
+ if (!HasSub1) {
+ HasSub1 = true;
+ IsSingleSubReg = true;
+ }
+ break;
+ }
+ if (!IsSingleSubReg) {
+ HasSub0 = false;
+ break;
+ }
+ } else {
+ HasSub0 = false;
+ break;
+ }
+ }
+
+ return (HasSub0 && HasSub1);
+}
+
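+// Conservatively decide whether control flow can reach ToBB from FromBB:
+// same block, dominance, post-dominance, or both blocks in a common loop.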
+bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, MachineLoopInfo *LI,
+ MachineBasicBlock *ToBB) {
+ if (FromBB == ToBB) {
+ return true;
+ }
+
+ if (DT->dominates(FromBB, ToBB)) {
+ return true;
+ }
+
+ if (PDT->dominates(ToBB, FromBB)) {
+ return true;
+ }
+
+ if (loopContainsBoth(LI, ToBB, FromBB)) {
+ return true;
+ }
+ // TODO: cover the case where the hot BB is in a loop and one block in that
+ // loop dominates BB, or BB post-dominates one block in that loop.
+ return false;
+}
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
new file mode 100644
index 0000000000000..c4452c91a43a8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -0,0 +1,62 @@
+//===------- AMDGPUMIRUtils.h - Helpers for MIR passes --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for MIR passes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+
+class LiveInterval;
+class SlotIndexes;
+class MachineRegisterInfo;
+class SIRegisterInfo;
+class MachineDominatorTree;
+class MachinePostDominatorTree;
+
+constexpr unsigned RegForVCC = 2;
+
+bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd,
+ llvm::MachineBasicBlock &MBB);
+
+// Check if LI is live across basic blocks; save all touched basic blocks if
+// it is local.
+bool isLocalLiveInterval(
+ const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes,
+ llvm::SmallDenseSet<llvm::MachineBasicBlock *, 2> &TouchedMBBSet);
+bool isLocalLiveInterval(const llvm::LiveInterval &LI,
+ llvm::SlotIndexes *Indexes);
+
+bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI);
+
+using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
+void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI);
+
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI);
+void collectLiveSetPressure(const LiveSet &LiveSet,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI,
+ unsigned &VPressure, unsigned &SPressure);
+
+bool reach_block(llvm::MachineBasicBlock *FromBB,
+ llvm::MachineDominatorTree *DT,
+ llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
+ llvm::MachineBasicBlock *ToBB);
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
new file mode 100644
index 0000000000000..32301130606a7
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -0,0 +1,18 @@
+//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//==------------------------------------------------------------------------==//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//==------------------------------------------------------------------------==//
+
+namespace llvm {
+}
+
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
new file mode 100644
index 0000000000000..f9be0a2c73d86
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -0,0 +1,53 @@
+//==-- AMDGPUOccupancyAndLatencyHelper.h - Helpers for occupancy + latency -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
+
+namespace llvm {
+
+class MachineFunction;
+class GCNSubtarget;
+class MachineLoopInfo;
+
+struct SchedScore {
+ // Score for this Sched result.
+ unsigned Occupancy = 0;
+ bool SgprSpill = false;
+ unsigned LatencyHide = 0; // Only latency hide will split 2 load into 2 pass?
+ unsigned MemLatency = 0; // Only save mem latency.
+ // We want memory latency to be small and latency hiding to be big. Compare
+ // MemLatency - LatencyHide * Occ; smaller is better.
+ unsigned MixAlu = 0; // VALU and SALU can run in parallel if Occ > 1.
+ unsigned Alu = 0; // Avoid SALU sequences whose count is less than the occupancy.
+ unsigned Lds = 0; // Todo: count lds.
+ SchedScore() {}
+
+ // Other info which can help compare schedule result.
+ float computeScore() const;
+ float computeScore2() const;
+
+ void sum(const SchedScore &S, unsigned LoopDepth = 0);
+ bool isBetter(const SchedScore &S) const;
+ bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const;
+ // More latency can be hidden with ExtraOcc.
+ unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;
+};
+
+SchedScore collectLatency(llvm::MachineFunction &MF,
+ const llvm::GCNSubtarget &ST,
+ const llvm::MachineLoopInfo *MLI = nullptr);
+
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 90e3489ced923..9c1aec6cd047d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -397,6 +397,12 @@ static cl::opt<bool>
cl::desc("Enable s_delay_alu insertion"),
cl::init(true), cl::Hidden);
+// Enable Hot block rematerialize
+static cl::opt<bool>
+ EnableHotBlockRemat("amdgpu-enable-hot-block-remat",
+ cl::desc("Enable HotBlock Rematerialize optimization"),
+ cl::init(false), cl::Hidden);
+
// Enable GFX11+ VOPD
static cl::opt<bool>
EnableVOPD("amdgpu-enable-vopd",
@@ -521,6 +527,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAtomicOptimizerPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
+ initializeAMDGPUHotBlockRematerializePass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
@@ -1539,6 +1546,10 @@ void GCNPassConfig::addOptimizedRegAlloc() {
if (TM->getOptLevel() > CodeGenOptLevel::Less)
insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+ // Rematerialization must run before PHI elimination.
+ if (isPassEnabled(EnableHotBlockRemat))
+ addPass(&AMDGPUHotBlockRematerializeID);
+
TargetPassConfig::addOptimizedRegAlloc();
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..79fdbba1d0db1 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelDivergenceLowering.cpp
AMDGPUGlobalISelUtils.cpp
+ AMDGPUHotBlockRematerialize.cpp
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
@@ -81,10 +82,12 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUMemoryUtils.cpp
+ AMDGPUMIRUtils.cpp
AMDGPUIGroupLP.cpp
AMDGPUMCResourceInfo.cpp
AMDGPUMarkLastScratchLoad.cpp
AMDGPUMIRFormatter.cpp
+ AMDGPUOccupancyAndLatencyHelper.cpp
AMDGPUPerfHintAnalysis.cpp
AMDGPUPostLegalizerCombiner.cpp
AMDGPUPreLegalizerCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 7554b9f578fcb..aa4b3f948b726 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -47,6 +47,10 @@ struct GCNRegPressure {
void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
+ unsigned getMaxSGPR() const {
+ return std::max(getSGPRNum(), getSGPRTuplesWeight());
+ }
+
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR32]; }
/// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p
>From 6854976b4d2ae4af1d3caba6ef2b5c39c7925d2d Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang at users.noreply.github.com>
Date: Fri, 18 Apr 2025 15:24:29 -0700
Subject: [PATCH 2/6] First build
---
.../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 237 ++++++++-
llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 467 +++++++++++++++++-
llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 40 ++
.../AMDGPUOccupancyAndLatencyHelper.cpp | 151 ++++++
.../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 27 +
5 files changed, 909 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 70b25beeb22b9..95237062a6093 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -37,6 +37,7 @@ namespace {
typedef DenseSet<MachineInstr *> InstSet;
typedef DenseSet<MachineBasicBlock *> BlockSet;
+template <typename T> using BlockMap = MapVector<MachineBasicBlock *, T>;
struct RematNode {
enum class RematKind {
@@ -107,20 +108,17 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void applyCloneRemat(RematNode &Node,
+ std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+ SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF);
void applyRemat(MapVector<Register, RematNode> &RematMap,
std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
MachineFunction &MF);
- void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
- llvm::SlotIndexes *SlotIndexes,
- const SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII);
- void applyCloneRemat(RematNode &Node,
- std::vector<BlockLiveInfo> &HotBlocks,
- MachineDominatorTree *DT, MachineRegisterInfo &MRI,
- llvm::SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII, MachineFunction &MF);
bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
LiveIntervals *LIS, MachineDominatorTree *DT,
MachinePostDominatorTree *PDT, bool &IsNearTarget);
@@ -138,6 +136,227 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
}
};
+MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash(
+ MachineInstr *InstructionToMove, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ const bool WillSmashScc =
+ InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
+ if (WillSmashScc) {
+ CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef(
+ MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+ }
+
+ return CurrentInsertPoint;
+}
+
+DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
+ unsigned Reg, BlockMap<SmallVector<MachineInstr *, 2>> &UserBlocks,
+ DenseSet<MachineBasicBlock *> &UserMBBSet,
+ std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT) {
+ // Collect hot blocks which Exp is live in.
+ DenseSet<MachineBasicBlock *> HotBlockSet;
+ for (BlockLiveInfo &HotBlock : HotBlocks) {
+ if (HotBlock.InputLive.count(Reg)) {
+ HotBlockSet.insert(HotBlock.BB);
+ }
+ }
+
+ // User blocks which dominate all hot blocks don't need a clone, because the
+ // value no longer crosses the hot blocks once the later blocks are cloned.
+ // User blocks which are dominated by all hot blocks can share clones,
+ // because once past the hot blocks the pressure is OK.
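+ // Illustrative CFG (sketch, not from the original patch): entry -> hot -> A
+ // -> B. A user in `entry` dominates the hot block and keeps the original
+ // def (no clone); users in A and B are dominated by the hot block, and since
+ // A dominates B they end up sharing a single clone inserted in A.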
+ DenseSet<MachineBasicBlock *> AfterHotRangeMBBs;
+ for (MachineBasicBlock *MBB : UserMBBSet) {
+ // Always clone in hot block.
+ if (HotBlockSet.count(MBB))
+ continue;
+
+ bool IsDomAllHotBlocks = true;
+ bool IsDomedByAllHotBlocks = true;
+ for (MachineBasicBlock *HotMBB : HotBlockSet) {
+ if (!DT->dominates(MBB, HotMBB)) {
+ IsDomAllHotBlocks = false;
+ }
+ if (!DT->dominates(HotMBB, MBB)) {
+ IsDomedByAllHotBlocks = false;
+ }
+ if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) {
+ break;
+ }
+ }
+ if (IsDomAllHotBlocks) {
+ UserBlocks.erase(MBB);
+ } else if (IsDomedByAllHotBlocks) {
+ AfterHotRangeMBBs.insert(MBB);
+ }
+ }
+
+ // Split after hotRange block set by domtree.
+ DenseMap<MachineBasicBlock *, BlockSet> DomMap;
+ if (!AfterHotRangeMBBs.empty()) {
+ for (MachineBasicBlock *MBB : AfterHotRangeMBBs) {
+ for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) {
+ if (MBB == MBB2)
+ continue;
+ if (DT->dominates(MBB, MBB2)) {
+ auto &Dom = DomMap[MBB];
+ Dom.insert(MBB2);
+ auto &Dom2 = DomMap[MBB2];
+ Dom.insert(Dom2.begin(), Dom2.end());
+ }
+ }
+ }
+ for (MachineBasicBlock *MBB : AfterHotRangeMBBs) {
+ auto &Dom = DomMap[MBB];
+ for (MachineBasicBlock *DomedMBB : Dom) {
+ // Remove domedMBB.
+ DomMap.erase(DomedMBB);
+ UserMBBSet.erase(DomedMBB);
+ }
+ }
+ }
+
+ return DomMap;
+}
+
+void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef,
+ SmallVector<MachineInstr *, 2> &UserMIs) {
+ for (MachineInstr *UseMI : UserMIs) {
+ for (MachineOperand &MO : UseMI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.getReg() == Reg) {
+ MO.setReg(NewReg);
+ if (IsSubRegDef)
+ MO.setSubReg(0);
+ }
+ }
+ }
+}
+
+void AMDGPUHotBlockRematerialize::applyCloneRemat(RematNode &Node,
+ std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+ SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF) {
+ unsigned Reg = Node.Reg;
+
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+ auto DefOp = DefMI->getOperand(0);
+ const MCInstrDesc &Desc = DefMI->getDesc();
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ // When the unique def has subReg, just create newReg for the subReg part.
+ bool IsSubRegDef = false;
+ if (DefOp.getSubReg() != 0) {
+ RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg());
+ IsSubRegDef = true;
+ }
+ const DebugLoc DL = DefMI->getDebugLoc();
+ unsigned OpNum = DefMI->getNumOperands();
+
+ Node.Kind = RematNode::RematKind::Clone;
+
+ // Group user in same blocks.
+ BlockMap<SmallVector<MachineInstr *, 2>> UserMap;
+ DenseSet<MachineBasicBlock *> UserMBBSet;
+ for (auto UseIt = MRI.use_instr_nodbg_begin(Reg);
+ UseIt != MRI.use_instr_nodbg_end();) {
+ MachineInstr &UseMI = *(UseIt++);
+ UserMap[UseMI.getParent()].emplace_back(&UseMI);
+ UserMBBSet.insert(UseMI.getParent());
+ }
+
+ DenseMap<MachineBasicBlock *, BlockSet> DomMap =
+ reduceClonedMBBs(Reg, UserMap, UserMBBSet, HotBlocks, DT);
+
+ for (auto UseIt : UserMap) {
+ MachineBasicBlock *MBB = UseIt.first;
+ // Skip same block uses.
+ if (MBB == DefMI->getParent()) {
+ continue;
+ }
+ // Skip MBB which share clone from other MBBs.
+ if (UserMBBSet.count(MBB) == 0)
+ continue;
+
+ Register NewReg = MRI.createVirtualRegister(RC);
+ auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg);
+ for (unsigned I = 1; I < OpNum; I++) {
+ NewDef = NewDef.add(DefMI->getOperand(I));
+ }
+
+ MachineInstr *InsertPointMI = UseIt.second.front();
+ SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI);
+
+ for (MachineInstr *UseMI : UseIt.second) {
+ SlotIndex Slot = SlotIndexes->getInstructionIndex(*UseMI);
+ if (LastSlot > Slot) {
+ LastSlot = Slot;
+ InsertPointMI = UseMI;
+ }
+ }
+
+ MachineBasicBlock::iterator InsertPoint = adjustInsertPointToAvoidSccSmash(
+ DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII);
+
+ for (MachineMemOperand *MO : DefMI->memoperands()) {
+ NewDef->addMemOperand(MF, MO);
+ }
+
+ MBB->insert(InsertPoint, NewDef);
+
+ SlotIndexes->insertMachineInstrInMaps(*NewDef);
+
+ SmallVector<MachineInstr *, 2> &UserMIs = UseIt.second;
+ updateUsers(Reg, NewReg, IsSubRegDef, UserMIs);
+
+ // update users in dom MBBs.
+ auto DomMapIt = DomMap.find(MBB);
+ if (DomMapIt != DomMap.end()) {
+ for (MachineBasicBlock *UpdateMBB : DomMapIt->second) {
+ SmallVector<MachineInstr *, 2> &UserMIs = UserMap[UpdateMBB];
+ updateUsers(Reg, NewReg, IsSubRegDef, UserMIs);
+ }
+ }
+
+ llvm::removeUnusedLanes(*NewDef.getInstr(), MRI, SIRI, SIII, SlotIndexes);
+ }
+ if (MRI.use_empty(Reg)) {
+ SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+ }
+}
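+// Illustrative behavior of applyCloneRemat (descriptive sketch, not from the
+// original patch): for a def in bb.0 whose uses sit in bb.2 and bb.3 past the
+// hot block, one clone is built per surviving user block, inserted before the
+// earliest use in that block (stepping around a live SCC if needed), the uses
+// are rewritten to the new vreg, and the original def is dropped from the
+// slot index maps once it has no remaining users.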
+
+void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
+ SlotIndexes *SlotIndexes,
+ const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII) {
+ MachineInstr *DefMI = Node.DefMI;
+ MachineInstr *InsertPointMI = Node.InsertPointMI;
+ MachineBasicBlock *MBB = nullptr;
+
+ // Find a valid insert point.
+ MachineBasicBlock::iterator InsertPoint;
+ if (InsertPointMI) {
+ InsertPoint = InsertPointMI->getIterator();
+ MBB = InsertPointMI->getParent();
+ } else {
+ InsertPoint = Node.InsertBlock->getFirstTerminator();
+ MBB = Node.InsertBlock;
+ }
+
+ InsertPoint = adjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI,
+ SIRI, SIII);
+
+ // Move instruction to new location.
+ DefMI->removeFromParent();
+ InsertPoint->getParent()->insert(InsertPoint, DefMI);
+
+ // Update slot index.
+ SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+ SlotIndexes->insertMachineInstrInMaps(*DefMI);
+}
+
void AMDGPUHotBlockRematerialize::applyRemat(MapVector<Register, RematNode> &RematMap,
std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index dc8b67e368516..6d6bd38c61c06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#define DEBUG_TYPE "xb-mir-util"
using namespace llvm;
@@ -79,14 +80,132 @@ bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) {
return true;
}
+// LoopInfo contains a mapping from basic block to the innermost loop. Find
+// the outermost loop in the loop nest that contains BB.
+const MachineLoop *getOutermostLoop(const MachineLoopInfo *LI,
+ const MachineBasicBlock *BB) {
+ const MachineLoop *L = LI->getLoopFor(BB);
+ if (L) {
+ while (const MachineLoop *Parent = L->getParentLoop())
+ L = Parent;
+ }
+ return L;
+}
+
+bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1,
+ const MachineBasicBlock *BB2) {
+ const MachineLoop *L1 = getOutermostLoop(LI, BB1);
+ const MachineLoop *L2 = getOutermostLoop(LI, BB2);
+ return L1 != nullptr && L1 == L2;
+}
+
} // namespace
+
+namespace llvm {
+
+bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
+ llvm::MachineBasicBlock::iterator MI) {
+ const TargetRegisterInfo *TRI =
+ MBB->getParent()->getRegInfo().getTargetRegisterInfo();
+ for (auto It = MI; It != MBB->end(); ++It) {
+ const MachineInstr &CurMI = *It;
+ // Hit use of scc, it is live.
+ if (CurMI.readsRegister(AMDGPU::SCC, TRI))
+ return true;
+ // Hit def of scc first, not live.
+ if (CurMI.definesRegister(AMDGPU::SCC, TRI))
+ return false;
+ }
+ // Reach the end of MBB, check live-ins of MBB successors.
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ->isLiveIn(AMDGPU::SCC))
+ return true;
+ }
+ return false;
+}
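+// Illustrative reading of isSccLiveAt (sketch, not from the original patch):
+// it returns true when MI or a later instruction in the block reads SCC
+// before anything redefines it, or when the end of the block is reached and a
+// successor lists SCC as live-in; otherwise SCC is dead at MI.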
+
+//
+// This function is useful for when we need to insert a new
+// instruction that defines scc in a block and we need to find
+// a location that will not smash the existing value.
+//
+// Starting at `BeforeInst` it will look backwards to try to find
+// a place in the block where scc is dead so we can insert our new
+// def there. If no location can be found it will save and restore
+// scc around BeforeInst. This way BeforeInst can safely be used
+// as the new insert location.
+//
+MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
+ MachineBasicBlock *MBB, MachineBasicBlock::iterator MI,
+ const TargetRegisterInfo *TRI, const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) {
+ // If SCC is dead at MI then we can use MI as the insert point.
+ if (!llvm::isSccLiveAt(MBB, MI)) {
+ return MI;
+ }
+
+ const bool CheckForExecWrite =
+ Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
+
+ // Get the starting reverse iterator taking care to handle the MBB->end()
+ // case.
+ MachineBasicBlock::reverse_iterator Start;
+ if (MI == MBB->end()) {
+ Start = MBB->rbegin();
+ } else {
+ Start = MI.getReverse();
+ }
+
+ // Otherwise, walk backwards through the block looking for a location where
+ // SCC is dead.
+ for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend();
+ It != End; ++It) {
+ // If the instruction modifies exec then we cannot use it as
+ // an insertion point (if that is a constraint from the caller).
+ // The check for EXEC works for both wave64 and wave32 because
+ // it will also catch writes to the subregisters (e.g. exec_lo).
+ if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) {
+ break;
+ }
+
+ if (It->modifiesRegister(AMDGPU::SCC, TRI) &&
+ !It->readsRegister(AMDGPU::SCC, TRI)) {
+ return It->getIterator();
+ }
+ }
+
+ // If no safe location can be found in the block we can save and restore
+ // SCC around MI. There is no way to directly read or write SCC so we use
+ // s_cselect to read the current value of SCC and s_cmp to write the saved
+ // value back to SCC.
+ //
+ // The generated code will look like this;
+ //
+ // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC
+ // <----- Newly created safe insert point.
+ // MI
+ // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC
+ //
+ Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(*MBB, std::next(MI->getIterator()), DL,
+ TII->get(AMDGPU::S_CMP_LG_U32))
+ .addReg(TmpScc, RegState::Kill)
+ .addImm(0);
+
+ return MI;
+}
+
// In case like float4 v, v.x used and defined in one block, v.y used and define
// in another block, one live interval could touch more than one MBB.
// TouchedMBBSet is used for scheduling where local live interval could cross
// multiple regions, need to calculate livereg for each region inside touched
// MBB.
-bool llvm::isLocalLiveInterval(
+bool isLocalLiveInterval(
const LiveInterval &LI, SlotIndexes *Indexes,
SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
if (LI.hasSubRanges()) {
@@ -98,7 +217,7 @@ bool llvm::isLocalLiveInterval(
return isLocalLiveRange(&LI, Indexes, TouchedMBBSet);
}
-bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
+bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
if (LI.hasSubRanges()) {
for (const auto &S : LI.subranges()) {
if (!isLocalLiveRange(&S, Indexes))
@@ -108,7 +227,7 @@ bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
return isLocalLiveRange(&LI, Indexes);
}
-void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
+void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
dbgs() << "\n live set: \n";
for (auto It : LiveSet) {
@@ -121,7 +240,347 @@ void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
}
}
-namespace llvm {
+LaneBitmask getRegMask(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) {
+ // We don't rely on the read-undef flag because in case of tentative schedule
+ // tracking it isn't set correctly yet. This works correctly however since
+ // use mask has been tracked before using LIS.
+ return MO.getSubReg() == 0
+ ? MRI.getMaxLaneMaskForVReg(MO.getReg())
+ : MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(
+ MO.getSubReg());
+}
+
+struct Piece {
+ unsigned Reg;
+ unsigned Offset;
+ unsigned Size;
+ static SmallVector<Piece, 8> split(std::bitset<32> Mask) {
+
+ SmallVector<Piece, 8> Pieces;
+ Piece Piece = {0, 0, 0};
+ for (unsigned i = 0; i < 32; i++) {
+ if (Mask.test(i)) {
+ if (Piece.Size == 0)
+ Piece.Offset = i;
+
+ Piece.Size++;
+ // Make sure no piece is bigger than 8 lanes.
+ if (Piece.Size == 8) {
+ Pieces.emplace_back(Piece);
+ Piece.Size = 0;
+ }
+ } else {
+ if (Piece.Size == 0) {
+ continue;
+ }
+ Pieces.emplace_back(Piece);
+ Piece.Size = 0;
+ }
+ }
+ return Pieces;
+ }
+};
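+// Illustrative example (hypothetical masks, not from the original patch):
+// Piece::split(0b0000'0011'1100) yields one piece {Offset = 2, Size = 4},
+// while a mask with a gap such as 0b1100'0011 yields two pieces
+// ({Offset = 0, Size = 2} and {Offset = 6, Size = 2}), which the caller below
+// currently rejects.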
+
+static unsigned getNumLanesIn32BitReg(Register Reg, const SIRegisterInfo *SIRI,
+ const MachineRegisterInfo &MRI) {
+ const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+ const TargetRegisterClass *SubregRC =
+ SIRI->getSubRegisterClass(RC, AMDGPU::sub0);
+ return SubregRC->LaneMask.getNumLanes();
+}
+
+static std::vector<unsigned>
+getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterInfo *TRI,
+ const TargetRegisterClass *RC,
+ LaneBitmask Mask) {
+ // TODO: this could replace the code it was copied from in SplitKit.cpp
+
+ // First pass: Try to find a perfectly matching subregister index.
+ // If none exists find the one covering the most lanemask bits.
+ SmallVector<unsigned, 8> PossibleIndexes;
+ unsigned BestIdx = 0;
+ const LaneBitmask Avoid = ~Mask;
+ {
+ unsigned BestCover = 0;
+ for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
+ // Is this index even compatible with the given class?
+ if (TRI->getSubClassWithSubReg(RC, Idx) != RC)
+ continue;
+ LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == Mask) {
+ BestIdx = Idx;
+ break;
+ }
+
+ // The index must not cover any lanes outside the mask.
+ if ((SubRegMask & Avoid).any())
+ continue;
+
+ unsigned PopCount = SubRegMask.getNumLanes();
+ PossibleIndexes.push_back(Idx);
+ if (PopCount > BestCover) {
+ BestCover = PopCount;
+ BestIdx = Idx;
+ }
+ }
+ }
+
+ // Abort if we cannot possibly implement the COPY with the given indexes.
+ if (BestIdx == 0) {
+ LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for "
+ << TRI->getRegClassName(RC) << " mask "
+ << PrintLaneMask(Mask) << '\n');
+ assert(false && "Impossible to span reg class");
+ return std::vector<unsigned>();
+ }
+
+ std::vector<unsigned> Result;
+ Result.push_back(BestIdx);
+
+ // Greedy heuristic: Keep iterating keeping the best covering subreg index
+ // each time.
+ Mask &= ~(TRI->getSubRegIndexLaneMask(BestIdx));
+ while (Mask.any()) {
+ BestIdx = 0;
+ int BestCover = std::numeric_limits<int>::min();
+ for (unsigned Idx : PossibleIndexes) {
+ LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == Mask) {
+ BestIdx = Idx;
+ break;
+ }
+
+ // Guaranteed above
+ assert((SubRegMask & Avoid).none());
+
+ // Try to cover as much of the remaining lanes as possible but as few of
+ // the already covered lanes as possible.
+ int Cover = (SubRegMask & Mask).getNumLanes() -
+ (SubRegMask & ~Mask).getNumLanes();
+ if (Cover > BestCover) {
+ BestCover = Cover;
+ BestIdx = Idx;
+ }
+ }
+
+ if (BestIdx == 0) {
+ LLVM_DEBUG(
+ dbgs() << "Unable to find minimal spanning sub register(s) for "
+ << TRI->getRegClassName(RC) << " mask " << PrintLaneMask(Mask)
+ << '\n');
+ assert(false && "Impossible to span reg class");
+ return std::vector<unsigned>();
+ }
+
+ Result.push_back(BestIdx);
+ Mask &= ~TRI->getSubRegIndexLaneMask(BestIdx);
+ }
+
+ return Result;
+}
+
+static void updateSubReg(MachineOperand &UseMO,
+ const llvm::TargetRegisterClass *NewRC,
+ unsigned Offset, const SIRegisterInfo *SIRI) {
+ unsigned Size = NewRC->getLaneMask().getNumLanes();
+ if (Size == 1) {
+ UseMO.setSubReg(0);
+ } else {
+ const uint32_t SubReg = UseMO.getSubReg();
+ LaneBitmask LaneMask = SIRI->getSubRegIndexLaneMask(SubReg);
+
+ unsigned Mask = LaneMask.getAsInteger() >> Offset;
+
+ unsigned NewSubReg = getMinimalSpanningSubRegIdxSetForLaneMask(
+ SIRI, NewRC, LaneBitmask(Mask))
+ .front();
+
+ UseMO.setSubReg(NewSubReg);
+ }
+}
+
+bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) {
+ MachineOperand &DstMO = MI.getOperand(0);
+ // Skip the case where the dst subreg is not 0.
+ if (DstMO.getSubReg()) {
+ return false;
+ }
+ Register Reg = DstMO.getReg();
+
+ SmallVector<MachineOperand *, 2> UseMOs;
+ for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) {
+ UseMOs.emplace_back(&UseMO);
+ }
+
+ const llvm::TargetRegisterClass *NewRC =
+ SIRI->getRegClass(Desc.operands().front().RegClass);
+ if (!NewRC->isAllocatable()) {
+ if (SIRI->isSGPRClass(NewRC))
+ NewRC = SIRI->getSGPRClassForBitWidth(NewRC->MC->RegSizeInBits);
+ else if (SIRI->isVGPRClass(NewRC))
+ NewRC = SIRI->getVGPRClassForBitWidth(NewRC->MC->RegSizeInBits);
+ else
+ return false;
+
+ if (!NewRC->isAllocatable())
+ return false;
+ }
+
+ unsigned NumLanes = NewRC->getLaneMask().getNumLanes();
+ if (Offset > 0) {
+ // Update offset operand in MI.
+ MachineOperand *OffsetOp =
+ SIII->getNamedOperand(MI, AMDGPU::OpName::offset);
+
+ const uint32_t LaneSize = sizeof(uint32_t);
+ if (OffsetOp) {
+ if (OffsetOp->isImm()) {
+ // Don't shadow the lane Offset parameter: the new immediate is the old
+ // immediate plus the lane offset scaled to bytes.
+ int64_t NewImmOffset = OffsetOp->getImm() + Offset * LaneSize;
+ if (!SIII->isLegalMUBUFImmOffset(NewImmOffset)) {
+ return false;
+ }
+ OffsetOp->setImm(NewImmOffset);
+ } else {
+ return false;
+ }
+ } else {
+ OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+ if (OffsetOp) {
+ Register NewOffsetReg =
+ MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(),
+ SIII->get(AMDGPU::S_ADD_U32))
+ .addDef(NewOffsetReg)
+ .add(*OffsetOp)
+ .addImm(Offset * LaneSize);
+ MachineInstr *OffsetAddMI = OffsetAdd.getInstr();
+ MachineBasicBlock::iterator InsertPoint =
+ llvm::findOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI,
+ SIII, &MRI);
+ MI.getParent()->insert(InsertPoint, OffsetAddMI);
+ SIII->legalizeOperands(*OffsetAddMI);
+ OffsetOp->setReg(NewOffsetReg);
+ OffsetOp->setSubReg(0);
+ if (SlotIndexes)
+ SlotIndexes->insertMachineInstrInMaps(*OffsetAddMI);
+ } else {
+ return false;
+ }
+ }
+ // Update subReg for users.
+ for (MachineOperand *UseMO : UseMOs) {
+ updateSubReg(*UseMO, NewRC, Offset, SIRI);
+ }
+ } else if (NumLanes == getNumLanesIn32BitReg(Reg, SIRI, MRI)) {
+ // Clear subReg when it's a single 32-bit reg.
+ for (MachineOperand *UseMO : UseMOs) {
+ UseMO->setSubReg(0);
+ }
+ }
+
+ MI.setDesc(Desc);
+ // Mutate reg class of Reg.
+ MRI.setRegClass(Reg, NewRC);
+ return true;
+}
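+// Illustrative effect of reduceChannel (sketch, mirrors the checks in
+// llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir): an
+// S_BUFFER_LOAD_DWORDX8_IMM whose users only read lanes .x/.y (Offset = 0,
+// two dwords) has its opcode switched to S_BUFFER_LOAD_DWORDX2_IMM and the
+// dst register class mutated to a 64-bit SGPR class, leaving the users'
+// subregister indices intact.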
+
+bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ SlotIndexes *SlotIndexes) {
+ bool IsImm = false;
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM:
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+ Register Reg = MI.getOperand(0).getReg();
+ if (!MRI.getUniqueVRegDef(Reg))
+ return false;
+ LaneBitmask DstMask = getRegMask(MI.getOperand(0), MRI);
+ LaneBitmask UseMask;
+ for (MachineOperand &MO : MRI.use_operands(Reg)) {
+ UseMask |= llvm::getRegMask(MO, MRI);
+ }
+
+ const unsigned FullMask = DstMask.getAsInteger();
+ unsigned Mask = UseMask.getAsInteger();
+ if (Mask == FullMask)
+ return false;
+ // Split the mask where there are gaps, then group the mask into pieces of
+ // 2/4/8 lanes.
+ auto Pieces = Piece::split(std::bitset<32>(Mask));
+ // Only a single piece is supported for now.
+ if (Pieces.size() != 1)
+ return false;
+ auto Piece = Pieces[0];
+ if (Piece.Size > 8)
+ return false;
+
+ // TODO: enable offset support when IsImm is true.
+ // Currently different tests break depending on whether the offset is
+ // multiplied by LaneSize or not.
+ if (IsImm && Piece.Offset != 0)
+ return false;
+
+ const unsigned Num32BitLanes =
+ Piece.Size / getNumLanesIn32BitReg(Reg, SIRI, MRI);
+
+ switch (Num32BitLanes) {
+ default:
+ return false;
+ case 1:
+ return reduceChannel(Piece.Offset, MI,
+ SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM
+ : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR),
+ MRI, SIRI, SIII, SlotIndexes);
+ case 2:
+ return reduceChannel(Piece.Offset, MI,
+ SIII->get(IsImm
+ ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR),
+ MRI, SIRI, SIII, SlotIndexes);
+ case 3:
+ if (FullMask == 0xff)
+ return false;
+ LLVM_FALLTHROUGH;
+ case 4:
+ return reduceChannel(Piece.Offset, MI,
+ SIII->get(IsImm
+ ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR),
+ MRI, SIRI, SIII, SlotIndexes);
+ case 5:
+ case 6:
+ case 7:
+ if (FullMask == 0xffff)
+ return false;
+ LLVM_FALLTHROUGH;
+ case 8:
+ return reduceChannel(Piece.Offset, MI,
+ SIII->get(IsImm
+ ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR),
+ MRI, SIRI, SIII, SlotIndexes);
+ }
+
+ } break;
+ }
+ return false;
+}
+
unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
const llvm::MachineRegisterInfo &MRI,
const llvm::SIRegisterInfo *SIRI) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
index c4452c91a43a8..6b9079e5d65fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -24,6 +24,7 @@ class LiveInterval;
class SlotIndexes;
class MachineRegisterInfo;
class SIRegisterInfo;
+class SIInstrInfo;
class MachineDominatorTree;
class MachinePostDominatorTree;
@@ -45,6 +46,45 @@ bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI);
using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI);
+bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
+ llvm::MachineBasicBlock::iterator MI);
+
+// An enum used to pass additional constraints to
+// `findOrCreateInsertionPointForSccDef()`. This will further
+// constrain the location where the scc def can be inserted.
+enum SccDefInsertPointConstraintFlags {
+ None = 0, // No additional constraints.
+ NoExecWrite = 1, // Should be no modification of exec between BeforeInst and
+ // insert point.
+};
+
+// Look for a safe place to insert an instruction that defines scc.
+//
+// This function is useful for when we need to insert a new
+// instruction that defines scc in a block and we need to find
+// a location that will not smash the existing value.
+//
+// Starting at `BeforeInst` it will look backwards to try to find
+// a place in the block where scc is dead so we can insert our new
+// def there. If no location can be found it will save and restore
+// scc around BeforeInst. This way BeforeInst can safely be used
+// as the new insert location.
+//
+llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
+ llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst,
+ const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII,
+ llvm::MachineRegisterInfo *MRI,
+ SccDefInsertPointConstraintFlags Constraints =
+ SccDefInsertPointConstraintFlags::None);
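+//
+// Illustrative usage (hypothetical call site; `MBB`, `MI`, `DL`, `Reg`, `TRI`,
+// `TII` and `MRI` are assumed to be in scope):
+//   MachineBasicBlock::iterator InsertPt =
+//       findOrCreateInsertionPointForSccDef(MBB, MI, TRI, TII, MRI);
+//   BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+//       .addReg(Reg)
+//       .addImm(0);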
+
+// For an inst like S_BUFFER_LOAD_DWORDX16, change it to S_BUFFER_LOAD_DWORDX4
+// if only 4 lanes are used.
+bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *TRI,
+ const llvm::SIInstrInfo *TII,
+ llvm::SlotIndexes *SlotIndexes);
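+//
+// Illustrative MIR (sketch that mirrors reduce_lane_sgpr.mir): given
+//   %v:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %rsrc:sgpr_128, 96, 0
+// where only %v.sub0 and %v.sub1 are read, the load is rewritten to
+//   %v:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %rsrc:sgpr_128, 96, 0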
+
unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
const llvm::MachineRegisterInfo &MRI,
const llvm::SIRegisterInfo *SIRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
index 32301130606a7..c2dbf1a8b297e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -12,7 +12,158 @@
//
//==------------------------------------------------------------------------==//
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+#include <cmath>
+
namespace llvm {
+
+void SchedScore::sum(const SchedScore &S, unsigned LoopDepth) {
+ unsigned LoopCount = LoopDepth > 0 ? std::pow(3, LoopDepth) : 1;
+ LatencyHide += LoopCount * S.LatencyHide;
+ MemLatency += LoopCount * S.MemLatency;
+ MixAlu += LoopCount * S.MixAlu;
+ Alu += LoopCount * S.Alu;
+ Lds += LoopCount * S.Lds;
+ SgprSpill |= S.SgprSpill;
+}
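+// Illustrative arithmetic (hypothetical values): a block at LoopDepth 2 is
+// weighted by 3^2 = 9, so S.Alu = 10 contributes 90 to the accumulated Alu.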
+// Does more occupancy give more performance?
+bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const {
+ unsigned Gain = latencyGain(TargetOccupancy, ExtraOcc);
+ // 10% is good enough.
+ if ((10 * Gain) >= Alu)
+ return true;
+ return false;
+}
+
+unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const {
+ unsigned Latency = MemLatency;
+ return (Latency / (TgtOcc)) - (Latency / (TgtOcc + ExtraOcc));
+}
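+// Illustrative arithmetic (hypothetical values): with MemLatency = 1200,
+// TgtOcc = 4 and ExtraOcc = 1, latencyGain = 1200/4 - 1200/5 = 60; isMemBound
+// then reports memory-bound iff 10 * 60 >= Alu, i.e. Alu <= 600.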
+
+// AMDGPULatencyTracker
+AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST)
+ : SIII(ST.getInstrInfo()), ItineraryData(ST.getInstrItineraryData()) {}
+
+void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
+ if (MI.isDebugInstr())
+ return;
+ int Latency = SIII->getInstrLatency(ItineraryData, MI);
+ // If there are outstanding latency instructions to hide.
+ if (!LatencyMIs.empty()) {
+ bool IsWaitCnt = false;
+ for (auto &MO : MI.operands()) {
+ if (MO.isReg()) {
+ Register Reg = MO.getReg();
+ auto It = LatencyMIs.find(Reg);
+ if (It != LatencyMIs.end()) {
+ IsWaitCnt = true;
+ // If MI uses a mem result, raise its latency to the mem latency.
+ int Cycle = It->second;
+ if (Cycle > Latency)
+ Latency = Cycle;
+ }
+ }
+ }
+ // Update the latency of each outstanding high-latency inst.
+ for (auto It = LatencyMIs.begin(); It != LatencyMIs.end();) {
+ auto L = It++;
+ int Cycle = L->second;
+ if (Cycle <= Latency) {
+ // No cycles left for this reg; remove it.
+ LatencyMIs.erase(L);
+ if (IsWaitCnt && Cycle == Latency) {
+ // Only count MemLatency once; the rest is hidden.
+ Score.MemLatency += Cycle;
+ IsWaitCnt = false;
+ } else {
+ // Hidden cycles (or should these count as mem latency?).
+ Score.LatencyHide += Cycle;
+ }
+ } else {
+ L->second -= Latency;
+ // Hide latency.
+ Score.LatencyHide += Latency;
+ }
+ }
+
+ } else {
+ // TODO: check branch/lds?
+ // TODO: check prevVAlu?
+ auto GetAluStatus = [](const MachineInstr &MI,
+ const llvm::SIInstrInfo *SIII) {
+ AluStatus Status = AluStatus::Nothing;
+ if (SIII->isVALU(MI.getOpcode())) {
+ Status = AluStatus::Vector;
+ } else if (SIII->isSALU(MI.getOpcode())) {
+ Status = AluStatus::Scalar;
+ }
+ return Status;
+ };
+ AluStatus Status = GetAluStatus(MI, SIII);
+
+ switch (PrevStatus) {
+ case AluStatus::Nothing: {
+ Score.Alu += Latency;
+ Score.MixAlu += Latency;
+ PrevStatus = Status;
+ } break;
+ case AluStatus::Vector:
+ case AluStatus::Scalar: {
+ Score.Alu += Latency;
+ // Ignore mix alu.
+ if (PrevStatus != Status) {
+ PrevStatus = AluStatus::Nothing;
+ } else {
+ Score.MixAlu += Latency;
+ }
+ } break;
+ }
+ }
+ // Record newly issued latency-producing instructions.
+ if (SIII->isHighLatencyDef(MI.getOpcode()) && MI.mayLoad()) {
+ Register Reg = MI.getOperand(0).getReg();
+ // TODO: get correct latency.
+ // SIII->getInstrLatency(ItineraryData, MI);
+ constexpr unsigned kHighLatency = 180;
+ LatencyMIs[Reg] = kHighLatency;
+ } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) {
+ Register Reg = MI.getOperand(0).getReg();
+ // TODO: get correct latency.
+ // SIII->getInstrLatency(ItineraryData, MI);
+ constexpr unsigned kLowLatency = 35;
+ LatencyMIs[Reg] = kLowLatency;
+ }
}
+SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
+ const llvm::MachineLoopInfo *MLI) {
+ SchedScore TotalScore;
+ for (MachineBasicBlock &MBB : MF) {
+ AMDGPULatencyTracker LatencyTracker(ST);
+ for (const MachineInstr &MI : MBB) {
+ LatencyTracker.scan(MI);
+ }
+ unsigned LoopDepth = 0;
+ if (MLI) {
+ LoopDepth = MLI->getLoopDepth(&MBB);
+ }
+ TotalScore.sum(LatencyTracker.Score, LoopDepth);
+ }
+ return TotalScore;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
index f9be0a2c73d86..b513e7335ffe4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -15,11 +15,16 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/ADT/DenseMap.h"
+
namespace llvm {
+class MachineInstr;
class MachineFunction;
class GCNSubtarget;
class MachineLoopInfo;
+class SIInstrInfo;
struct SchedScore {
// Score for this Sched result.
@@ -45,6 +50,28 @@ struct SchedScore {
unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;
};
+struct AMDGPULatencyTracker {
+ AMDGPULatencyTracker(const llvm::GCNSubtarget &ST);
+ const llvm::SIInstrInfo *SIII;
+ const llvm::InstrItineraryData *ItineraryData;
+ // Map from the dst reg of a latency MI to its remaining cycles.
+ llvm::DenseMap<unsigned, int> LatencyMIs;
+ SchedScore Score;
+ // Cycles of low-latency MIs that were not waited on.
+ unsigned HideLatency = 0;
+ unsigned MemLatency = 0;
+ // For simplicity, only consider a mixture of one VALU and one SALU;
+ // no grouping for now.
+ unsigned PrevSAlu = 0;
+ unsigned PrevVAlu = 0;
+ enum class AluStatus {
+ Nothing,
+ Vector,
+ Scalar,
+ } PrevStatus = AluStatus::Nothing;
+ void scan(const llvm::MachineInstr &MI);
+};
+
SchedScore collectLatency(llvm::MachineFunction &MF,
const llvm::GCNSubtarget &ST,
const llvm::MachineLoopInfo *MLI = nullptr);
>From 3c2b1f3acd43503c7f90781784687cd473af09fc Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang at users.noreply.github.com>
Date: Mon, 21 Apr 2025 15:29:26 -0700
Subject: [PATCH 3/6] Tests
---
.../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 13 +-
.../CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir | 565 ++++++++++++++++++
.../test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 452 ++++++++++++++
3 files changed, 1029 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 95237062a6093..5c628a89766c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -31,6 +31,8 @@
using namespace llvm;
+static cl::opt<bool>
+ EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
namespace {
@@ -723,6 +725,12 @@ int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI,
if (IsSingleDef) {
// The reg might share with other candidates, check It here.
// Count share reg in getReducedSize.
+ if (EnableAggressive) {
+ // In case of aggressive remat, treat a multi-use reg as a shared reg and
+ // ignore the size of shared regs.
+ if (!MRI.hasOneNonDBGUse(Reg))
+ continue;
+ }
const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
if (unsigned SubIdx = MO.getSubReg()) {
if (OpRC)
@@ -1253,6 +1261,9 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoop
unsigned SLimit = Status.TargetSLimit;
int RematSCnt = Status.MaxSPressure - SLimit;
+ // When doing aggressive SGPR remat, reserve some registers for allocation loss.
+ if (EnableAggressive)
+ RematSCnt += NearTargetRegLimit;
bool IsSGPRSpill = false;
if (RematSCnt > 0) {
@@ -1367,7 +1378,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoop
for (RematNode &Node : SRematList) {
SRematMap[Node.Reg] = Node;
RematCnt += Node.Size;
- if (RematCnt > RematSCnt)
+ if (RematCnt > RematSCnt && !EnableAggressive)
break;
}
NewRematSCnt = 0;
diff --git a/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir
new file mode 100644
index 0000000000000..02a9836313360
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir
@@ -0,0 +1,565 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-remat-enable-hot-block-remat-aggressive -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s
+
+# Check that the buffer loads have been moved to their uses and the lanes are reduced
+# correctly.
+#
+# CHECK: bb.2:
+#==========================================================================
+# X4_IMM, Using .x
+# CHECK: %[[#reg0:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 0, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 0, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 4, 0
+# X4_IMM, Using .xy
+# CHECK: %[[#reg1:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 16, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub0, %{{.+}}, 16, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub1, %{{.+}}, 20, 0
+# X4_IMM, Using .xyz
+# CHECK: %[[#reg2:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 32, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub0, %{{.+}}, 32, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub1, %{{.+}}, 36, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub2, %{{.+}}, 40, 0
+# X4_IMM, Using .yz
+# CHECK: %[[#reg3:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 48, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub1, %{{.+}}, 48, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub2, %{{.+}}, 52, 0
+# X4_IMM, Using .yzw
+# CHECK: %[[#reg4:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 64, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub1, %{{.+}}, 64, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub2, %{{.+}}, 68, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub3, %{{.+}}, 72, 0
+#==========================================================================
+# X8_IMM, Using .x
+# CHECK: %[[#reg5:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 80, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 80, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 84, 0
+# X8_IMM, Using .xy
+# CHECK: %[[#reg6:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 96, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub0, %{{.+}}, 96, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub1, %{{.+}}, 100, 0
+# X8_IMM, Using .xyz
+# CHECK: %[[#reg7:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 112, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub0, %{{.+}}, 112, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub1, %{{.+}}, 116, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub2, %{{.+}}, 120, 0
+# X8_IMM, Using .xyzw
+# CHECK: %[[#reg8:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 128, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub0, %{{.+}}, 128, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub1, %{{.+}}, 132, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub2, %{{.+}}, 136, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub3, %{{.+}}, 140, 0
+# X8_IMM, Using .xyzw + 5th dword
+# CHECK: %[[#reg9:]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %{{.+}}, 144, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub0, %{{.+}}, 144, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub1, %{{.+}}, 148, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub2, %{{.+}}, 152, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub3, %{{.+}}, 156, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub4, %{{.+}}, 160, 0
+#==========================================================================
+# X16_IMM, Using .xy and .zw
+# CHECK: %[[#reg10:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 160, 0
+# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub0_sub1, %{{.+}}, 160, 0
+# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub2_sub3, %{{.+}}, 164, 0
+#==========================================================================
+# X4_SGPR, Using .x
+# CHECK: %[[#reg11:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %{{.+}}, %{{.+}}, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 176, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 180, 0
+# X8_SGPR, Using .xy
+# CHECK: %[[#reg12:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR %{{.+}}, %{{.+}}, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub0, %{{.+}}, 192, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub1, %{{.+}}, 196, 0
+# X16_SGPR, Using .xy + .zw
+# CHECK: %[[#reg13:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %{{.+}}, %{{.+}}, 0
+# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub0_sub1, %{{.+}}, 208, 0
+# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub2_sub3, %{{.+}}, 216, 0
+#==========================================================================
+#
+#
+# CHECK: %[[#reg14:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 224, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0
+# CHECK: %[[#reg15:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 240, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0
+# CHECK: %[[#reg16:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 256, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0
+# CHECK: %[[#reg17:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 272, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0
+# CHECK: %[[#reg18:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 288, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0
+# CHECK: %[[#reg19:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 304, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0
+# CHECK: %[[#reg20:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 320, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0
+# CHECK: %[[#reg21:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 336, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0
+# CHECK: %[[#reg22:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 352, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0
+# CHECK: %[[#reg23:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 368, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0
+# CHECK: %[[#reg24:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 384, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0
+# CHECK: %[[#reg25:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 400, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0
+# CHECK: %[[#reg26:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 416, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0
+# CHECK: %[[#reg27:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 432, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0
+# CHECK: %[[#reg28:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 448, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0
+# CHECK: %[[#reg29:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 464, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0
+# CHECK: %[[#reg30:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 480, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0
+# CHECK: %[[#reg31:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 496, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0
+# CHECK: %[[#reg32:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 512, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0
+# CHECK: %[[#reg33:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 528, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0
+# CHECK: %[[#reg34:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 544, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0
+# CHECK: %[[#reg35:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 560, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0
+# CHECK: %[[#reg36:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 576, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0
+# CHECK: %[[#reg37:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 592, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0
+# CHECK: %[[#reg38:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 608, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0
+# CHECK: %[[#reg39:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 624, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0
+# CHECK: %[[#reg40:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 640, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0
+# CHECK: %[[#reg41:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 656, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0
+# CHECK: %[[#reg42:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 672, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0
+# CHECK: %[[#reg43:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 688, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0
+# CHECK: %[[#reg44:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 704, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0
+# CHECK: %[[#reg45:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 720, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0
+# CHECK: %[[#reg46:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 736, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0
+# CHECK: %[[#reg47:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 752, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0
+# CHECK: %[[#reg48:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 768, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0
+# CHECK: %[[#reg49:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 784, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0
+# CHECK: %[[#reg50:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 800, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0
+# CHECK: %[[#reg51:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 816, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0
+# CHECK: %[[#reg52:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 832, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0
+# CHECK: %[[#reg53:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 848, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0
+# CHECK: %[[#reg54:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 864, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0
+# CHECK: %[[#reg55:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 880, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0
+# CHECK: %[[#reg56:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 896, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0
+# CHECK: %[[#reg57:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 912, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0
+# CHECK: %[[#reg58:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 928, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0
+# CHECK: %[[#reg59:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 944, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0
+# CHECK: %[[#reg60:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 960, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0
+# CHECK: %[[#reg61:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 976, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0
+# CHECK: %[[#reg62:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 992, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0
+# CHECK: %[[#reg63:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0
+
+
+--- |
+ source_filename = ".\main.ll"
+ define amdgpu_ps void @main() #1 {
+ ret void
+ }
+ attributes #1 = { "target-cpu"="gfx1010" }
+ !llvm.ident = !{!0}
+ !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name: main
+tracksRegLiveness: true
+liveins:
+ - { reg: '$sgpr0' }
+ - { reg: '$sgpr1' }
+ - { reg: '$sgpr2' }
+ - { reg: '$sgpr3' }
+ - { reg: '$sgpr4' }
+ - { reg: '$sgpr5' }
+ - { reg: '$sgpr6' }
+ - { reg: '$sgpr7' }
+ - { reg: '$sgpr8' }
+ - { reg: '$sgpr8' }
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1
+
+ %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1
+ %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3
+ %2:sgpr_128 = REG_SEQUENCE $sgpr8, %subreg.sub0, $sgpr9, %subreg.sub1, $sgpr10, %subreg.sub2, $sgpr11, %subreg.sub3
+
+ ; X4_IMM
+ %3000:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 0, 0
+ %3001:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 16, 0
+ %3002:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 32, 0
+ %3003:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 48, 0
+ %3004:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 64, 0
+
+ ; X8_IMM
+ %3005:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 80, 0
+ %3006:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 96, 0
+ %3007:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 112, 0
+ %3008:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 128, 0
+ %3009:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 144, 0
+
+ ; X16_IMM
+ %30010:sgpr_512 = S_BUFFER_LOAD_DWORDX16_IMM %2:sgpr_128, 160, 0
+
+ ; X4_SGPR
+ %50:sgpr_32 = COPY $sgpr0
+ %30011:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %2:sgpr_128, %50, 0
+
+ ; X8_SGPR
+ %51:sgpr_32 = COPY $sgpr1
+ %30012:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR %2:sgpr_128, %51, 0
+
+ ; X16_SGPR
+ %52:sgpr_32 = COPY $sgpr2
+ %30013:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR %2:sgpr_128, %52, 0
+
+ %30014:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 224, 0
+ %30015:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 240, 0
+ %30016:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 256, 0
+ %30017:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 272, 0
+ %30018:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 288, 0
+ %30019:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 304, 0
+ %30020:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 320, 0
+ %30021:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 336, 0
+ %30022:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 352, 0
+ %30023:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 368, 0
+ %30024:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 384, 0
+ %30025:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 400, 0
+ %30026:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 416, 0
+ %30027:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 432, 0
+ %30028:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 448, 0
+ %30029:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 464, 0
+ %30030:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 480, 0
+ %30031:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 496, 0
+ %30032:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 512, 0
+ %30033:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 528, 0
+ %30034:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 544, 0
+ %30035:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 560, 0
+ %30036:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 576, 0
+ %30037:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 592, 0
+ %30038:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 608, 0
+ %30039:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 624, 0
+ %30040:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 640, 0
+ %30041:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 656, 0
+ %30042:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 672, 0
+ %30043:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 688, 0
+ %30044:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 704, 0
+ %30045:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 720, 0
+ %30046:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 736, 0
+ %30047:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 752, 0
+ %30048:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 768, 0
+ %30049:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 784, 0
+ %30050:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 800, 0
+ %30051:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 816, 0
+ %30052:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 832, 0
+ %30053:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 848, 0
+ %30054:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 864, 0
+ %30055:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 880, 0
+ %30056:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 896, 0
+ %30057:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 912, 0
+ %30058:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 928, 0
+ %30059:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 944, 0
+ %30060:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 960, 0
+ %30061:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 976, 0
+ %30062:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 992, 0
+ %30063:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 1008, 0
+
+ %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+
+ %8000:vgpr_32 = IMPLICIT_DEF
+ %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+ $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+ %8001:vgpr_32 = COPY %8000
+ S_BRANCH %bb.2
+
+ bb.2:
+
+ %3:vgpr_32 = IMPLICIT_DEF
+ ;==========================================================================
+ ; X4_IMM, Using .x
+ S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 0, 0
+ S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 4, 0 ; Do it a second time, since the lane reduction triggers on cloning, and cloning only happens when there are multiple uses.
+
+ ; X4_IMM, Using .xy
+ S_BUFFER_STORE_DWORD_IMM %3001.sub0, %1:sgpr_128, 16, 0
+ S_BUFFER_STORE_DWORD_IMM %3001.sub1, %1:sgpr_128, 20, 0
+
+ ; X4_IMM, Using .xyz
+ S_BUFFER_STORE_DWORD_IMM %3002.sub0, %1:sgpr_128, 32, 0
+ S_BUFFER_STORE_DWORD_IMM %3002.sub1, %1:sgpr_128, 36, 0
+ S_BUFFER_STORE_DWORD_IMM %3002.sub2, %1:sgpr_128, 40, 0
+
+ ; X4_IMM, Using .yz
+ S_BUFFER_STORE_DWORD_IMM %3003.sub1, %1:sgpr_128, 48, 0
+ S_BUFFER_STORE_DWORD_IMM %3003.sub2, %1:sgpr_128, 52, 0
+
+ ; X4_IMM, Using .yzw
+ S_BUFFER_STORE_DWORD_IMM %3004.sub1, %1:sgpr_128, 64, 0
+ S_BUFFER_STORE_DWORD_IMM %3004.sub2, %1:sgpr_128, 68, 0
+ S_BUFFER_STORE_DWORD_IMM %3004.sub3, %1:sgpr_128, 72, 0
+
+ ;==========================================================================
+ ; X8_IMM, Using .x
+ S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 80, 0
+ S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 84, 0
+
+ ; X8_IMM, Using .xy
+ S_BUFFER_STORE_DWORD_IMM %3006.sub0, %1:sgpr_128, 96, 0
+ S_BUFFER_STORE_DWORD_IMM %3006.sub1, %1:sgpr_128, 100, 0
+
+ ; X8_IMM, Using .xyz
+ S_BUFFER_STORE_DWORD_IMM %3007.sub0, %1:sgpr_128, 112, 0
+ S_BUFFER_STORE_DWORD_IMM %3007.sub1, %1:sgpr_128, 116, 0
+ S_BUFFER_STORE_DWORD_IMM %3007.sub2, %1:sgpr_128, 120, 0
+
+ ; X8_IMM, Using .xyzw
+ S_BUFFER_STORE_DWORD_IMM %3008.sub0, %1:sgpr_128, 128, 0
+ S_BUFFER_STORE_DWORD_IMM %3008.sub1, %1:sgpr_128, 132, 0
+ S_BUFFER_STORE_DWORD_IMM %3008.sub2, %1:sgpr_128, 136, 0
+ S_BUFFER_STORE_DWORD_IMM %3008.sub3, %1:sgpr_128, 140, 0
+
+ ; X8_IMM, Using .xyzw + 5th dword
+ S_BUFFER_STORE_DWORD_IMM %3009.sub0, %1:sgpr_128, 144, 0
+ S_BUFFER_STORE_DWORD_IMM %3009.sub1, %1:sgpr_128, 148, 0
+ S_BUFFER_STORE_DWORD_IMM %3009.sub2, %1:sgpr_128, 152, 0
+ S_BUFFER_STORE_DWORD_IMM %3009.sub3, %1:sgpr_128, 156, 0
+ S_BUFFER_STORE_DWORD_IMM %3009.sub4, %1:sgpr_128, 160, 0
+
+ ;==========================================================================
+ ; X16_IMM, Using .xy and .zw
+ S_BUFFER_STORE_DWORDX2_IMM %30010.sub0_sub1, %1:sgpr_128, 160, 0
+ S_BUFFER_STORE_DWORDX2_IMM %30010.sub2_sub3, %1:sgpr_128, 164, 0
+
+ ;==========================================================================
+ ; X4_SGPR, Using .x
+ S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 176, 0
+ S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 180, 0
+
+ ; X8_SGPR, Using .xy
+ S_BUFFER_STORE_DWORD_IMM %30012.sub0, %1:sgpr_128, 192, 0
+ S_BUFFER_STORE_DWORD_IMM %30012.sub1, %1:sgpr_128, 196, 0
+
+ ; X16_SGPR, Using .xy + .zw
+ S_BUFFER_STORE_DWORDX2_IMM %30013.sub0_sub1, %1:sgpr_128, 208, 0
+ S_BUFFER_STORE_DWORDX2_IMM %30013.sub2_sub3, %1:sgpr_128, 216, 0
+
+ ;==========================================================================
+ S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0
+
+ EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+ S_ENDPGM 0
+...
+
+
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
new file mode 100644
index 0000000000000..69875261b74e9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
@@ -0,0 +1,452 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s
+
+# Check that the loads have been moved next to their uses in bb.2
+# CHECK: bb.2:
+# CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0
+# CHECK: %[[#reg1:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 16, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg1]], %{{.+}}, 16, 0
+# CHECK: %[[#reg2:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 32, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg2]], %{{.+}}, 32, 0
+# CHECK: %[[#reg3:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 48, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg3]], %{{.+}}, 48, 0
+# CHECK: %[[#reg4:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 64, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg4]], %{{.+}}, 64, 0
+# CHECK: %[[#reg5:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 80, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg5]], %{{.+}}, 80, 0
+# CHECK: %[[#reg6:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 96, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg6]], %{{.+}}, 96, 0
+# CHECK: %[[#reg7:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 112, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg7]], %{{.+}}, 112, 0
+# CHECK: %[[#reg8:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 128, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg8]], %{{.+}}, 128, 0
+# CHECK: %[[#reg9:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 144, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg9]], %{{.+}}, 144, 0
+# CHECK: %[[#reg10:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 160, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg10]], %{{.+}}, 160, 0
+# CHECK: %[[#reg11:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 176, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg11]], %{{.+}}, 176, 0
+# CHECK: %[[#reg12:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 192, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg12]], %{{.+}}, 192, 0
+# CHECK: %[[#reg13:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 208, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg13]], %{{.+}}, 208, 0
+# CHECK: %[[#reg14:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 224, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0
+# CHECK: %[[#reg15:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 240, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0
+# CHECK: %[[#reg16:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 256, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0
+# CHECK: %[[#reg17:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 272, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0
+# CHECK: %[[#reg18:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 288, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0
+# CHECK: %[[#reg19:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 304, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0
+# CHECK: %[[#reg20:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 320, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0
+# CHECK: %[[#reg21:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 336, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0
+# CHECK: %[[#reg22:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 352, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0
+# CHECK: %[[#reg23:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 368, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0
+# CHECK: %[[#reg24:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 384, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0
+# CHECK: %[[#reg25:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 400, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0
+# CHECK: %[[#reg26:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 416, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0
+# CHECK: %[[#reg27:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 432, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0
+# CHECK: %[[#reg28:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 448, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0
+# CHECK: %[[#reg29:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 464, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0
+# CHECK: %[[#reg30:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 480, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0
+# CHECK: %[[#reg31:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 496, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0
+# CHECK: %[[#reg32:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 512, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0
+# CHECK: %[[#reg33:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 528, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0
+# CHECK: %[[#reg34:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 544, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0
+# CHECK: %[[#reg35:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 560, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0
+# CHECK: %[[#reg36:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 576, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0
+# CHECK: %[[#reg37:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 592, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0
+# CHECK: %[[#reg38:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 608, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0
+# CHECK: %[[#reg39:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 624, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0
+# CHECK: %[[#reg40:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 640, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0
+# CHECK: %[[#reg41:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 656, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0
+# CHECK: %[[#reg42:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 672, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0
+# CHECK: %[[#reg43:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 688, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0
+# CHECK: %[[#reg44:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 704, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0
+# CHECK: %[[#reg45:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 720, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0
+# CHECK: %[[#reg46:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 736, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0
+# CHECK: %[[#reg47:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 752, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0
+# CHECK: %[[#reg48:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 768, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0
+# CHECK: %[[#reg49:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 784, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0
+# CHECK: %[[#reg50:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 800, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0
+# CHECK: %[[#reg51:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 816, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0
+# CHECK: %[[#reg52:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 832, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0
+# CHECK: %[[#reg53:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 848, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0
+# CHECK: %[[#reg54:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 864, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0
+# CHECK: %[[#reg55:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 880, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0
+# CHECK: %[[#reg56:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 896, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0
+# CHECK: %[[#reg57:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 912, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0
+# CHECK: %[[#reg58:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 928, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0
+# CHECK: %[[#reg59:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 944, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0
+# CHECK: %[[#reg60:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 960, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0
+# CHECK: %[[#reg61:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 976, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0
+# CHECK: %[[#reg62:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 992, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0
+# CHECK: %[[#reg63:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0
+
+
+--- |
+ source_filename = ".\main.ll"
+ define amdgpu_ps void @main() #1 {
+ ret void
+ }
+ attributes #1 = { "target-cpu"="gfx1010" }
+ !llvm.ident = !{!0}
+ !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name: main
+tracksRegLiveness: true
+liveins:
+ - { reg: '$sgpr0' }
+ - { reg: '$sgpr1' }
+ - { reg: '$sgpr2' }
+ - { reg: '$sgpr3' }
+ - { reg: '$sgpr4' }
+ - { reg: '$sgpr5' }
+ - { reg: '$sgpr6' }
+ - { reg: '$sgpr7' }
+ - { reg: '$sgpr8' }
+ - { reg: '$sgpr8' }
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
+
+ %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1
+ ; undef %0.sub0:sgpr_64 = COPY $sgpr0
+ ; undef %0.sub1:sgpr_64 = COPY $sgpr1
+
+ %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3
+ ; undef %1.sub0:sgpr_128 = COPY $sgpr4
+ ; undef %1.sub1:sgpr_128 = COPY $sgpr5
+ ; undef %1.sub2:sgpr_128 = COPY $sgpr6
+ ; undef %1.sub3:sgpr_128 = COPY $sgpr7
+
+ %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0
+ %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0
+ %3002:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 32, 0
+ %3003:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 48, 0
+ %3004:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 64, 0
+ %3005:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 80, 0
+ %3006:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 96, 0
+ %3007:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 112, 0
+ %3008:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 128, 0
+ %3009:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 144, 0
+ %30010:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 160, 0
+ %30011:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 176, 0
+ %30012:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 192, 0
+ %30013:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 208, 0
+ %30014:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 224, 0
+ %30015:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 240, 0
+ %30016:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 256, 0
+ %30017:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 272, 0
+ %30018:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 288, 0
+ %30019:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 304, 0
+ %30020:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 320, 0
+ %30021:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 336, 0
+ %30022:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 352, 0
+ %30023:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 368, 0
+ %30024:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 384, 0
+ %30025:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 400, 0
+ %30026:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 416, 0
+ %30027:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 432, 0
+ %30028:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 448, 0
+ %30029:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 464, 0
+ %30030:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 480, 0
+ %30031:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 496, 0
+ %30032:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 512, 0
+ %30033:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 528, 0
+ %30034:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 544, 0
+ %30035:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 560, 0
+ %30036:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 576, 0
+ %30037:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 592, 0
+ %30038:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 608, 0
+ %30039:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 624, 0
+ %30040:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 640, 0
+ %30041:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 656, 0
+ %30042:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 672, 0
+ %30043:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 688, 0
+ %30044:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 704, 0
+ %30045:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 720, 0
+ %30046:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 736, 0
+ %30047:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 752, 0
+ %30048:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 768, 0
+ %30049:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 784, 0
+ %30050:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 800, 0
+ %30051:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 816, 0
+ %30052:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 832, 0
+ %30053:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 848, 0
+ %30054:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 864, 0
+ %30055:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 880, 0
+ %30056:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 896, 0
+ %30057:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 912, 0
+ %30058:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 928, 0
+ %30059:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 944, 0
+ %30060:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 960, 0
+ %30061:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 976, 0
+ %30062:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 992, 0
+ %30063:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 1008, 0
+
+ %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+
+ %8000:vgpr_32 = IMPLICIT_DEF
+ %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+ $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+ %8001:vgpr_32 = COPY %8000
+ S_BRANCH %bb.2
+
+ bb.2:
+
+ %3:vgpr_32 = IMPLICIT_DEF
+ S_BUFFER_STORE_DWORDX4_IMM killed %3000:sgpr_128, %1:sgpr_128, 0, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3001:sgpr_128, %1:sgpr_128, 16, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3002:sgpr_128, %1:sgpr_128, 32, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3003:sgpr_128, %1:sgpr_128, 48, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3004:sgpr_128, %1:sgpr_128, 64, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3005:sgpr_128, %1:sgpr_128, 80, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3006:sgpr_128, %1:sgpr_128, 96, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3007:sgpr_128, %1:sgpr_128, 112, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3008:sgpr_128, %1:sgpr_128, 128, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3009:sgpr_128, %1:sgpr_128, 144, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30010:sgpr_128, %1:sgpr_128, 160, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30011:sgpr_128, %1:sgpr_128, 176, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30012:sgpr_128, %1:sgpr_128, 192, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30013:sgpr_128, %1:sgpr_128, 208, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0
+
+ EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+ S_ENDPGM 0
+...
>From dbdc9a48b78f7cc97f25d7e0195d1e5423d69265 Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang at users.noreply.github.com>
Date: Mon, 21 Apr 2025 15:59:28 -0700
Subject: [PATCH 4/6] Added test for the phi crash in pressure tracker
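
The GCNRegPressure.cpp change below keeps the "register isn't live"
llvm_unreachable only for non-SSA functions: when the MIR is still in SSA
form (as it is while amdgpu-hot-block-remat runs and PHIs are present), a
register missing from LiveRegs is skipped instead of asserting. The new
phi.mir test exercises exactly that case.

As a rough, standalone illustration of that guarded update (not the tracker
code itself; the type and names below are invented for the example):

    // Standalone sketch of the "tolerate a missing live-set entry while in
    // SSA form" pattern; all names here are invented for the example.
    #include <cassert>
    #include <cstdint>
    #include <unordered_map>

    using LaneMask = uint32_t;

    struct LiveSetSketch {
      std::unordered_map<unsigned, LaneMask> LiveRegs; // Reg -> live lanes
      bool IsSSA = true; // stands in for MRI->isSSA()

      // Clear Mask's lanes of Reg. Outside SSA a missing entry still asserts;
      // in SSA form (e.g. a value that only reaches here through a PHI) it is
      // skipped instead of being treated as an error.
      void clearLanes(unsigned Reg, LaneMask Mask) {
        auto It = LiveRegs.find(Reg);
        if (It == LiveRegs.end()) {
          assert(IsSSA && "register isn't live");
          return;
        }
        It->second &= ~Mask;
        if (It->second == 0)
          LiveRegs.erase(It);
      }
    };
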
---
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 18 +-
llvm/test/CodeGen/AMDGPU/remat/phi.mir | 607 ++++++++++++++++++++++
2 files changed, 618 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/remat/phi.mir
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index f74d12cfab0c0..7f76d14eb9ab0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -549,22 +549,26 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
if (!S.liveAt(SI)) {
if (It == LiveRegs.end()) {
It = LiveRegs.find(MO.getReg());
- if (It == LiveRegs.end())
+ if (!MRI->isSSA() && It == LiveRegs.end())
llvm_unreachable("register isn't live");
}
- auto PrevMask = It->second;
- It->second &= ~S.LaneMask;
- CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI);
+ if (It != LiveRegs.end()) {
+ auto PrevMask = It->second;
+ It->second &= ~S.LaneMask;
+ CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI);
+ }
}
}
if (It != LiveRegs.end() && It->second.none())
LiveRegs.erase(It);
} else if (!LI.liveAt(SI)) {
auto It = LiveRegs.find(MO.getReg());
- if (It == LiveRegs.end())
+ if (!MRI->isSSA() && It == LiveRegs.end())
llvm_unreachable("register isn't live");
- CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI);
- LiveRegs.erase(It);
+ if (It != LiveRegs.end()) {
+ CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI);
+ LiveRegs.erase(It);
+ }
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi.mir b/llvm/test/CodeGen/AMDGPU/remat/phi.mir
new file mode 100644
index 0000000000000..2d22e9fba2593
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/phi.mir
@@ -0,0 +1,607 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -amdgpu-remat-enable-hot-block-remat-aggressive -run-pass=amdgpu-hot-block-remat -o - | FileCheck %s
+
+# This test simply checks that GCNDownwardRPTracker does not crash when PHIs are
+# present.
+
+# CHECK: S_ENDPGM
+
+--- |
+ source_filename = ".\main.ll"
+ define amdgpu_ps void @main() #1 {
+ ret void
+ }
+ attributes #1 = { "target-cpu"="gfx1010" }
+ !llvm.ident = !{!0}
+ !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name: main
+tracksRegLiveness: true
+liveins:
+ - { reg: '$sgpr0' }
+ - { reg: '$sgpr1' }
+ - { reg: '$sgpr2' }
+ - { reg: '$sgpr3' }
+ - { reg: '$sgpr4' }
+ - { reg: '$sgpr5' }
+ - { reg: '$sgpr6' }
+ - { reg: '$sgpr7' }
+ - { reg: '$sgpr8' }
+ - { reg: '$sgpr8' }
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
+
+ %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1
+ ; undef %0.sub0:sgpr_64 = COPY $sgpr0
+ ; undef %0.sub1:sgpr_64 = COPY $sgpr1
+
+ %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3
+ ; undef %1.sub0:sgpr_128 = COPY $sgpr4
+ ; undef %1.sub1:sgpr_128 = COPY $sgpr5
+ ; undef %1.sub2:sgpr_128 = COPY $sgpr6
+ ; undef %1.sub3:sgpr_128 = COPY $sgpr7
+
+
+ %2000:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2001:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2002:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2003:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2004:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2005:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2006:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2007:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2008:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2009:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2010:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2011:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2012:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2013:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2014:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2015:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2016:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2017:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2018:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2019:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2020:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2021:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2022:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2023:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2024:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2025:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2026:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2027:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2028:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2029:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2030:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2031:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2032:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2033:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2034:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2035:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2036:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2037:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2038:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2039:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2040:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2041:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2042:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2043:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2044:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2045:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2046:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2047:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2048:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2049:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2050:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2051:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2052:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2053:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2054:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2055:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2056:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2057:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2058:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2059:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2060:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2061:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2062:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2063:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2064:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2065:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2066:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2067:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2068:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2069:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2070:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2071:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2072:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2073:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2074:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2075:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2076:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2077:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2078:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2079:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2080:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2081:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2082:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2083:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2084:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2085:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2086:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2087:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2088:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2089:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2090:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2091:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2092:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2093:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2094:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2095:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2096:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2097:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2098:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %2099:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %3000:sgpr_32 = S_MOV_B32 0
+ %3001:sgpr_32 = S_MOV_B32 1
+ %3002:sgpr_32 = S_MOV_B32 2
+ %3003:sgpr_32 = S_MOV_B32 3
+ %3004:sgpr_32 = S_MOV_B32 4
+ %3005:sgpr_32 = S_MOV_B32 5
+ %3006:sgpr_32 = S_MOV_B32 6
+ %3007:sgpr_32 = S_MOV_B32 7
+ %3008:sgpr_32 = S_MOV_B32 8
+ %3009:sgpr_32 = S_MOV_B32 9
+ %3010:sgpr_32 = S_MOV_B32 10
+ %3011:sgpr_32 = S_MOV_B32 11
+ %3012:sgpr_32 = S_MOV_B32 12
+ %3013:sgpr_32 = S_MOV_B32 13
+ %3014:sgpr_32 = S_MOV_B32 14
+ %3015:sgpr_32 = S_MOV_B32 15
+ %3016:sgpr_32 = S_MOV_B32 16
+ %3017:sgpr_32 = S_MOV_B32 17
+ %3018:sgpr_32 = S_MOV_B32 18
+ %3019:sgpr_32 = S_MOV_B32 19
+ %3020:sgpr_32 = S_MOV_B32 20
+ %3021:sgpr_32 = S_MOV_B32 21
+ %3022:sgpr_32 = S_MOV_B32 22
+ %3023:sgpr_32 = S_MOV_B32 23
+ %3024:sgpr_32 = S_MOV_B32 24
+ %3025:sgpr_32 = S_MOV_B32 25
+ %3026:sgpr_32 = S_MOV_B32 26
+ %3027:sgpr_32 = S_MOV_B32 27
+ %3028:sgpr_32 = S_MOV_B32 28
+ %3029:sgpr_32 = S_MOV_B32 29
+ %3030:sgpr_32 = S_MOV_B32 30
+ %3031:sgpr_32 = S_MOV_B32 31
+ %3032:sgpr_32 = S_MOV_B32 32
+ %3033:sgpr_32 = S_MOV_B32 33
+ %3034:sgpr_32 = S_MOV_B32 34
+ %3035:sgpr_32 = S_MOV_B32 35
+ %3036:sgpr_32 = S_MOV_B32 36
+ %3037:sgpr_32 = S_MOV_B32 37
+ %3038:sgpr_32 = S_MOV_B32 38
+ %3039:sgpr_32 = S_MOV_B32 39
+ %3040:sgpr_32 = S_MOV_B32 40
+ %3041:sgpr_32 = S_MOV_B32 41
+ %3042:sgpr_32 = S_MOV_B32 42
+ %3043:sgpr_32 = S_MOV_B32 43
+ %3044:sgpr_32 = S_MOV_B32 44
+ %3045:sgpr_32 = S_MOV_B32 45
+ %3046:sgpr_32 = S_MOV_B32 46
+ %3047:sgpr_32 = S_MOV_B32 47
+ %3048:sgpr_32 = S_MOV_B32 48
+ %3049:sgpr_32 = S_MOV_B32 49
+ %3050:sgpr_32 = S_MOV_B32 50
+ %3051:sgpr_32 = S_MOV_B32 51
+ %3052:sgpr_32 = S_MOV_B32 52
+ %3053:sgpr_32 = S_MOV_B32 53
+ %3054:sgpr_32 = S_MOV_B32 54
+ %3055:sgpr_32 = S_MOV_B32 55
+ %3056:sgpr_32 = S_MOV_B32 56
+ %3057:sgpr_32 = S_MOV_B32 57
+ %3058:sgpr_32 = S_MOV_B32 58
+ %3059:sgpr_32 = S_MOV_B32 59
+ %3060:sgpr_32 = S_MOV_B32 60
+ %3061:sgpr_32 = S_MOV_B32 61
+ %3062:sgpr_32 = S_MOV_B32 62
+ %3063:sgpr_32 = S_MOV_B32 63
+ %3064:sgpr_32 = S_MOV_B32 64
+ %3065:sgpr_32 = S_MOV_B32 65
+ %3066:sgpr_32 = S_MOV_B32 66
+ %3067:sgpr_32 = S_MOV_B32 67
+ %3068:sgpr_32 = S_MOV_B32 68
+ %3069:sgpr_32 = S_MOV_B32 69
+ %3070:sgpr_32 = S_MOV_B32 70
+ %3071:sgpr_32 = S_MOV_B32 71
+ %3072:sgpr_32 = S_MOV_B32 72
+ %3073:sgpr_32 = S_MOV_B32 73
+ %3074:sgpr_32 = S_MOV_B32 74
+ %3075:sgpr_32 = S_MOV_B32 75
+ %3076:sgpr_32 = S_MOV_B32 76
+ %3077:sgpr_32 = S_MOV_B32 77
+ %3078:sgpr_32 = S_MOV_B32 78
+ %3079:sgpr_32 = S_MOV_B32 79
+ %3080:sgpr_32 = S_MOV_B32 80
+ %3081:sgpr_32 = S_MOV_B32 81
+ %3082:sgpr_32 = S_MOV_B32 82
+ %3083:sgpr_32 = S_MOV_B32 83
+ %3084:sgpr_32 = S_MOV_B32 84
+ %3085:sgpr_32 = S_MOV_B32 85
+ %3086:sgpr_32 = S_MOV_B32 86
+ %3087:sgpr_32 = S_MOV_B32 87
+ %3088:sgpr_32 = S_MOV_B32 88
+ %3089:sgpr_32 = S_MOV_B32 89
+ %3090:sgpr_32 = S_MOV_B32 90
+ %3091:sgpr_32 = S_MOV_B32 91
+ %3092:sgpr_32 = S_MOV_B32 92
+ %3093:sgpr_32 = S_MOV_B32 93
+ %3094:sgpr_32 = S_MOV_B32 94
+ %3095:sgpr_32 = S_MOV_B32 95
+ %3096:sgpr_32 = S_MOV_B32 96
+ %3097:sgpr_32 = S_MOV_B32 97
+ %3098:sgpr_32 = S_MOV_B32 98
+ %3099:sgpr_32 = S_MOV_B32 99
+
+
+ %8000:vgpr_32 = IMPLICIT_DEF
+ %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+ $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+
+ %8001:vgpr_32 = COPY %8000
+ %8002:vgpr_32 = COPY %8000
+ %8003:vgpr_32 = COPY %8000
+ %8004:vgpr_32 = COPY %8000
+ %8005:vgpr_32 = COPY %8000
+ %8006:vgpr_32 = COPY %8000
+ %8007:vgpr_32 = COPY %8000
+ %8008:vgpr_32 = COPY %8000
+ %8009:vgpr_32 = COPY %8000
+ %8010:vgpr_32 = COPY %8000
+ %8011:vgpr_32 = COPY %8000
+ %8012:vgpr_32 = COPY %8000
+ %8013:vgpr_32 = COPY %8000
+ %8014:vgpr_32 = COPY %8000
+ %8015:vgpr_32 = COPY %8000
+ %8016:vgpr_32 = COPY %8000
+ %8017:vgpr_32 = COPY %8000
+
+ %9001:vgpr_32 = COPY %8001
+ %9002:vgpr_32 = COPY %8002
+ %9003:vgpr_32 = COPY %8003
+ %9004:vgpr_32 = COPY %8004
+ %9005:vgpr_32 = COPY %8005
+ %9006:vgpr_32 = COPY %8006
+ %9007:vgpr_32 = COPY %8007
+ %9008:vgpr_32 = COPY %8008
+ %9009:vgpr_32 = COPY %8009
+ %9010:vgpr_32 = COPY %8010
+ %9011:vgpr_32 = COPY %8011
+ %9012:vgpr_32 = COPY %8012
+ %9013:vgpr_32 = COPY %8013
+ %9014:vgpr_32 = COPY %8014
+ %9015:vgpr_32 = COPY %8015
+ %9016:vgpr_32 = COPY %8016
+ %9017:vgpr_32 = COPY %8017
+
+ S_BRANCH %bb.2
+
+ bb.2:
+ %5000:sgpr_32 = PHI %3000, %bb.0, %8001, %bb.1
+ %5001:sgpr_32 = PHI %3001, %bb.0, %8001, %bb.1
+ %5002:sgpr_32 = PHI %3002, %bb.0, %8001, %bb.1
+ %5003:sgpr_32 = PHI %3003, %bb.0, %8001, %bb.1
+ %5004:sgpr_32 = PHI %3004, %bb.0, %8001, %bb.1
+ %5005:sgpr_32 = PHI %3005, %bb.0, %8001, %bb.1
+ %5006:sgpr_32 = PHI %3006, %bb.0, %8001, %bb.1
+ %5007:sgpr_32 = PHI %3007, %bb.0, %8001, %bb.1
+ %5008:sgpr_32 = PHI %3008, %bb.0, %8001, %bb.1
+ %5009:sgpr_32 = PHI %3009, %bb.0, %8001, %bb.1
+ %5010:sgpr_32 = PHI %3010, %bb.0, %8001, %bb.1
+ %5011:sgpr_32 = PHI %3011, %bb.0, %8001, %bb.1
+ %5012:sgpr_32 = PHI %3012, %bb.0, %8001, %bb.1
+ %5013:sgpr_32 = PHI %3013, %bb.0, %8001, %bb.1
+ %5014:sgpr_32 = PHI %3014, %bb.0, %8001, %bb.1
+ %5015:sgpr_32 = PHI %3015, %bb.0, %8001, %bb.1
+ %5016:sgpr_32 = PHI %3016, %bb.0, %8001, %bb.1
+ %5017:sgpr_32 = PHI %3017, %bb.0, %8001, %bb.1
+ %5018:sgpr_32 = PHI %3018, %bb.0, %8001, %bb.1
+ %5019:sgpr_32 = PHI %3019, %bb.0, %8001, %bb.1
+ %5020:sgpr_32 = PHI %3020, %bb.0, %8001, %bb.1
+ %5021:sgpr_32 = PHI %3021, %bb.0, %8001, %bb.1
+ %5022:sgpr_32 = PHI %3022, %bb.0, %8001, %bb.1
+ %5023:sgpr_32 = PHI %3023, %bb.0, %8001, %bb.1
+ %5024:sgpr_32 = PHI %3024, %bb.0, %8001, %bb.1
+ %5025:sgpr_32 = PHI %3025, %bb.0, %8001, %bb.1
+ %5026:sgpr_32 = PHI %3026, %bb.0, %8001, %bb.1
+ %5027:sgpr_32 = PHI %3027, %bb.0, %8001, %bb.1
+ %5028:sgpr_32 = PHI %3028, %bb.0, %8001, %bb.1
+ %5029:sgpr_32 = PHI %3029, %bb.0, %8001, %bb.1
+ %5030:sgpr_32 = PHI %3030, %bb.0, %8001, %bb.1
+ %5031:sgpr_32 = PHI %3031, %bb.0, %8001, %bb.1
+ %5032:sgpr_32 = PHI %3032, %bb.0, %8001, %bb.1
+ %5033:sgpr_32 = PHI %3033, %bb.0, %8001, %bb.1
+ %5034:sgpr_32 = PHI %3034, %bb.0, %8001, %bb.1
+ %5035:sgpr_32 = PHI %3035, %bb.0, %8001, %bb.1
+ %5036:sgpr_32 = PHI %3036, %bb.0, %8001, %bb.1
+ %5037:sgpr_32 = PHI %3037, %bb.0, %8001, %bb.1
+ %5038:sgpr_32 = PHI %3038, %bb.0, %8001, %bb.1
+ %5039:sgpr_32 = PHI %3039, %bb.0, %8001, %bb.1
+ %5040:sgpr_32 = PHI %3040, %bb.0, %8001, %bb.1
+ %5041:sgpr_32 = PHI %3041, %bb.0, %8001, %bb.1
+ %5042:sgpr_32 = PHI %3042, %bb.0, %8001, %bb.1
+ %5043:sgpr_32 = PHI %3043, %bb.0, %8001, %bb.1
+ %5044:sgpr_32 = PHI %3044, %bb.0, %8001, %bb.1
+ %5045:sgpr_32 = PHI %3045, %bb.0, %8001, %bb.1
+ %5046:sgpr_32 = PHI %3046, %bb.0, %8001, %bb.1
+ %5047:sgpr_32 = PHI %3047, %bb.0, %8001, %bb.1
+ %5048:sgpr_32 = PHI %3048, %bb.0, %8001, %bb.1
+ %5049:sgpr_32 = PHI %3049, %bb.0, %8001, %bb.1
+ %5050:sgpr_32 = PHI %3050, %bb.0, %8001, %bb.1
+ %5051:sgpr_32 = PHI %3051, %bb.0, %8001, %bb.1
+ %5052:sgpr_32 = PHI %3052, %bb.0, %8001, %bb.1
+ %5053:sgpr_32 = PHI %3053, %bb.0, %8001, %bb.1
+ %5054:sgpr_32 = PHI %3054, %bb.0, %8001, %bb.1
+ %5055:sgpr_32 = PHI %3055, %bb.0, %8001, %bb.1
+ %5056:sgpr_32 = PHI %3056, %bb.0, %8001, %bb.1
+ %5057:sgpr_32 = PHI %3057, %bb.0, %8001, %bb.1
+ %5058:sgpr_32 = PHI %3058, %bb.0, %8001, %bb.1
+ %5059:sgpr_32 = PHI %3059, %bb.0, %8001, %bb.1
+ %5060:sgpr_32 = PHI %3060, %bb.0, %8001, %bb.1
+ %5061:sgpr_32 = PHI %3061, %bb.0, %8001, %bb.1
+ %5062:sgpr_32 = PHI %3062, %bb.0, %8001, %bb.1
+ %5063:sgpr_32 = PHI %3063, %bb.0, %8001, %bb.1
+ %5064:sgpr_32 = PHI %3064, %bb.0, %8001, %bb.1
+ %5065:sgpr_32 = PHI %3065, %bb.0, %8001, %bb.1
+ %5066:sgpr_32 = PHI %3066, %bb.0, %8001, %bb.1
+ %5067:sgpr_32 = PHI %3067, %bb.0, %8001, %bb.1
+ %5068:sgpr_32 = PHI %3068, %bb.0, %8001, %bb.1
+ %5069:sgpr_32 = PHI %3069, %bb.0, %8001, %bb.1
+ %5070:sgpr_32 = PHI %3070, %bb.0, %8001, %bb.1
+ %5071:sgpr_32 = PHI %3071, %bb.0, %8001, %bb.1
+ %5072:sgpr_32 = PHI %3072, %bb.0, %8001, %bb.1
+ %5073:sgpr_32 = PHI %3073, %bb.0, %8001, %bb.1
+ %5074:sgpr_32 = PHI %3074, %bb.0, %8001, %bb.1
+ %5075:sgpr_32 = PHI %3075, %bb.0, %8001, %bb.1
+ %5076:sgpr_32 = PHI %3076, %bb.0, %8001, %bb.1
+ %5077:sgpr_32 = PHI %3077, %bb.0, %8001, %bb.1
+ %5078:sgpr_32 = PHI %3078, %bb.0, %8001, %bb.1
+ %5079:sgpr_32 = PHI %3079, %bb.0, %8001, %bb.1
+ %5080:sgpr_32 = PHI %3080, %bb.0, %8001, %bb.1
+ %5081:sgpr_32 = PHI %3081, %bb.0, %8001, %bb.1
+ %5082:sgpr_32 = PHI %3082, %bb.0, %8001, %bb.1
+ %5083:sgpr_32 = PHI %3083, %bb.0, %8001, %bb.1
+ %5084:sgpr_32 = PHI %3084, %bb.0, %8001, %bb.1
+ %5085:sgpr_32 = PHI %3085, %bb.0, %8001, %bb.1
+ %5086:sgpr_32 = PHI %3086, %bb.0, %8001, %bb.1
+ %5087:sgpr_32 = PHI %3087, %bb.0, %8001, %bb.1
+ %5088:sgpr_32 = PHI %3088, %bb.0, %8001, %bb.1
+ %5089:sgpr_32 = PHI %3089, %bb.0, %8001, %bb.1
+ %5090:sgpr_32 = PHI %3090, %bb.0, %8001, %bb.1
+ %5091:sgpr_32 = PHI %3091, %bb.0, %8001, %bb.1
+ %5092:sgpr_32 = PHI %3092, %bb.0, %8001, %bb.1
+ %5093:sgpr_32 = PHI %3093, %bb.0, %8001, %bb.1
+ %5094:sgpr_32 = PHI %3094, %bb.0, %8001, %bb.1
+ %5095:sgpr_32 = PHI %3095, %bb.0, %8001, %bb.1
+ %5096:sgpr_32 = PHI %3096, %bb.0, %8001, %bb.1
+ %5097:sgpr_32 = PHI %3097, %bb.0, %8001, %bb.1
+ %5098:sgpr_32 = PHI %3098, %bb.0, %8001, %bb.1
+ %5099:sgpr_32 = PHI %3099, %bb.0, %8001, %bb.1
+
+
+ %3:vgpr_32 = IMPLICIT_DEF
+
+ %6000:vgpr_32 = V_MOV_B32_e32 %5000, implicit $exec
+ %6001:vgpr_32 = V_MOV_B32_e32 %5001, implicit $exec
+ %6002:vgpr_32 = V_MOV_B32_e32 %5002, implicit $exec
+ %6003:vgpr_32 = V_MOV_B32_e32 %5003, implicit $exec
+ %6004:vgpr_32 = V_MOV_B32_e32 %5004, implicit $exec
+ %6005:vgpr_32 = V_MOV_B32_e32 %5005, implicit $exec
+ %6006:vgpr_32 = V_MOV_B32_e32 %5006, implicit $exec
+ %6007:vgpr_32 = V_MOV_B32_e32 %5007, implicit $exec
+ %6008:vgpr_32 = V_MOV_B32_e32 %5008, implicit $exec
+ %6009:vgpr_32 = V_MOV_B32_e32 %5009, implicit $exec
+ %6010:vgpr_32 = V_MOV_B32_e32 %5010, implicit $exec
+ %6011:vgpr_32 = V_MOV_B32_e32 %5011, implicit $exec
+ %6012:vgpr_32 = V_MOV_B32_e32 %5012, implicit $exec
+ %6013:vgpr_32 = V_MOV_B32_e32 %5013, implicit $exec
+ %6014:vgpr_32 = V_MOV_B32_e32 %5014, implicit $exec
+ %6015:vgpr_32 = V_MOV_B32_e32 %5015, implicit $exec
+ %6016:vgpr_32 = V_MOV_B32_e32 %5016, implicit $exec
+ %6017:vgpr_32 = V_MOV_B32_e32 %5017, implicit $exec
+ %6018:vgpr_32 = V_MOV_B32_e32 %5018, implicit $exec
+ %6019:vgpr_32 = V_MOV_B32_e32 %5019, implicit $exec
+ %6020:vgpr_32 = V_MOV_B32_e32 %5020, implicit $exec
+ %6021:vgpr_32 = V_MOV_B32_e32 %5021, implicit $exec
+ %6022:vgpr_32 = V_MOV_B32_e32 %5022, implicit $exec
+ %6023:vgpr_32 = V_MOV_B32_e32 %5023, implicit $exec
+ %6024:vgpr_32 = V_MOV_B32_e32 %5024, implicit $exec
+ %6025:vgpr_32 = V_MOV_B32_e32 %5025, implicit $exec
+ %6026:vgpr_32 = V_MOV_B32_e32 %5026, implicit $exec
+ %6027:vgpr_32 = V_MOV_B32_e32 %5027, implicit $exec
+ %6028:vgpr_32 = V_MOV_B32_e32 %5028, implicit $exec
+ %6029:vgpr_32 = V_MOV_B32_e32 %5029, implicit $exec
+ %6030:vgpr_32 = V_MOV_B32_e32 %5030, implicit $exec
+ %6031:vgpr_32 = V_MOV_B32_e32 %5031, implicit $exec
+ %6032:vgpr_32 = V_MOV_B32_e32 %5032, implicit $exec
+ %6033:vgpr_32 = V_MOV_B32_e32 %5033, implicit $exec
+ %6034:vgpr_32 = V_MOV_B32_e32 %5034, implicit $exec
+ %6035:vgpr_32 = V_MOV_B32_e32 %5035, implicit $exec
+ %6036:vgpr_32 = V_MOV_B32_e32 %5036, implicit $exec
+ %6037:vgpr_32 = V_MOV_B32_e32 %5037, implicit $exec
+ %6038:vgpr_32 = V_MOV_B32_e32 %5038, implicit $exec
+ %6039:vgpr_32 = V_MOV_B32_e32 %5039, implicit $exec
+ %6040:vgpr_32 = V_MOV_B32_e32 %5040, implicit $exec
+ %6041:vgpr_32 = V_MOV_B32_e32 %5041, implicit $exec
+ %6042:vgpr_32 = V_MOV_B32_e32 %5042, implicit $exec
+ %6043:vgpr_32 = V_MOV_B32_e32 %5043, implicit $exec
+ %6044:vgpr_32 = V_MOV_B32_e32 %5044, implicit $exec
+ %6045:vgpr_32 = V_MOV_B32_e32 %5045, implicit $exec
+ %6046:vgpr_32 = V_MOV_B32_e32 %5046, implicit $exec
+ %6047:vgpr_32 = V_MOV_B32_e32 %5047, implicit $exec
+ %6048:vgpr_32 = V_MOV_B32_e32 %5048, implicit $exec
+ %6049:vgpr_32 = V_MOV_B32_e32 %5049, implicit $exec
+ %6050:vgpr_32 = V_MOV_B32_e32 %5050, implicit $exec
+ %6051:vgpr_32 = V_MOV_B32_e32 %5051, implicit $exec
+ %6052:vgpr_32 = V_MOV_B32_e32 %5052, implicit $exec
+ %6053:vgpr_32 = V_MOV_B32_e32 %5053, implicit $exec
+ %6054:vgpr_32 = V_MOV_B32_e32 %5054, implicit $exec
+ %6055:vgpr_32 = V_MOV_B32_e32 %5055, implicit $exec
+ %6056:vgpr_32 = V_MOV_B32_e32 %5056, implicit $exec
+ %6057:vgpr_32 = V_MOV_B32_e32 %5057, implicit $exec
+ %6058:vgpr_32 = V_MOV_B32_e32 %5058, implicit $exec
+ %6059:vgpr_32 = V_MOV_B32_e32 %5059, implicit $exec
+ %6060:vgpr_32 = V_MOV_B32_e32 %5060, implicit $exec
+ %6061:vgpr_32 = V_MOV_B32_e32 %5061, implicit $exec
+ %6062:vgpr_32 = V_MOV_B32_e32 %5062, implicit $exec
+ %6063:vgpr_32 = V_MOV_B32_e32 %5063, implicit $exec
+ %6064:vgpr_32 = V_MOV_B32_e32 %5064, implicit $exec
+ %6065:vgpr_32 = V_MOV_B32_e32 %5065, implicit $exec
+ %6066:vgpr_32 = V_MOV_B32_e32 %5066, implicit $exec
+ %6067:vgpr_32 = V_MOV_B32_e32 %5067, implicit $exec
+ %6068:vgpr_32 = V_MOV_B32_e32 %5068, implicit $exec
+ %6069:vgpr_32 = V_MOV_B32_e32 %5069, implicit $exec
+ %6070:vgpr_32 = V_MOV_B32_e32 %5070, implicit $exec
+ %6071:vgpr_32 = V_MOV_B32_e32 %5071, implicit $exec
+ %6072:vgpr_32 = V_MOV_B32_e32 %5072, implicit $exec
+ %6073:vgpr_32 = V_MOV_B32_e32 %5073, implicit $exec
+ %6074:vgpr_32 = V_MOV_B32_e32 %5074, implicit $exec
+ %6075:vgpr_32 = V_MOV_B32_e32 %5075, implicit $exec
+ %6076:vgpr_32 = V_MOV_B32_e32 %5076, implicit $exec
+ %6077:vgpr_32 = V_MOV_B32_e32 %5077, implicit $exec
+ %6078:vgpr_32 = V_MOV_B32_e32 %5078, implicit $exec
+ %6079:vgpr_32 = V_MOV_B32_e32 %5079, implicit $exec
+ %6080:vgpr_32 = V_MOV_B32_e32 %5080, implicit $exec
+ %6081:vgpr_32 = V_MOV_B32_e32 %5081, implicit $exec
+ %6082:vgpr_32 = V_MOV_B32_e32 %5082, implicit $exec
+ %6083:vgpr_32 = V_MOV_B32_e32 %5083, implicit $exec
+ %6084:vgpr_32 = V_MOV_B32_e32 %5084, implicit $exec
+ %6085:vgpr_32 = V_MOV_B32_e32 %5085, implicit $exec
+ %6086:vgpr_32 = V_MOV_B32_e32 %5086, implicit $exec
+ %6087:vgpr_32 = V_MOV_B32_e32 %5087, implicit $exec
+ %6088:vgpr_32 = V_MOV_B32_e32 %5088, implicit $exec
+ %6089:vgpr_32 = V_MOV_B32_e32 %5089, implicit $exec
+ %6090:vgpr_32 = V_MOV_B32_e32 %5090, implicit $exec
+ %6091:vgpr_32 = V_MOV_B32_e32 %5091, implicit $exec
+ %6092:vgpr_32 = V_MOV_B32_e32 %5092, implicit $exec
+ %6093:vgpr_32 = V_MOV_B32_e32 %5093, implicit $exec
+ %6094:vgpr_32 = V_MOV_B32_e32 %5094, implicit $exec
+ %6095:vgpr_32 = V_MOV_B32_e32 %5095, implicit $exec
+ %6096:vgpr_32 = V_MOV_B32_e32 %5096, implicit $exec
+ %6097:vgpr_32 = V_MOV_B32_e32 %5097, implicit $exec
+ %6098:vgpr_32 = V_MOV_B32_e32 %5098, implicit $exec
+ %6099:vgpr_32 = V_MOV_B32_e32 %5099, implicit $exec
+ EXP 0, %6000, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6001, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6002, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6003, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6004, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6005, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6006, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6007, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6008, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6009, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6010, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6011, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6012, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6013, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6014, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6015, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6016, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6017, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6018, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6019, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6020, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6021, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6022, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6023, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6024, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6025, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6026, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6027, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6028, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6029, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6030, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6031, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6032, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6033, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6034, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6035, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6036, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6037, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6038, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6039, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6040, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6041, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6042, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6043, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6044, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6045, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6046, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6047, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6048, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6049, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6050, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6051, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6052, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6053, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6054, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6055, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6056, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6057, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6058, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6059, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6060, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6061, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6062, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6063, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6064, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6065, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6066, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6067, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6068, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6069, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6070, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6071, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6072, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6073, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6074, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6075, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6076, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6077, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6078, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6079, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6080, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6081, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6082, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6083, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6084, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6085, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6086, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6087, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6088, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6089, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6090, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6091, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6092, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6093, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6094, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6095, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6096, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6097, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6098, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, %6099, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+ S_ENDPGM 0
+...
+
>From d4fd382d1a23303d1804c3169a589f2aa55a58b4 Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang at users.noreply.github.com>
Date: Mon, 21 Apr 2025 15:59:36 -0700
Subject: [PATCH 5/6] clang format
---
.../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 69 +++++++++----------
llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 14 ++--
llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 6 +-
.../AMDGPUOccupancyAndLatencyHelper.cpp | 5 +-
.../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 4 +-
5 files changed, 46 insertions(+), 52 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 5c628a89766c3..3c5d592602c6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -12,20 +12,20 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
#include "AMDGPUMIRUtils.h"
#include "AMDGPUOccupancyAndLatencyHelper.h"
-#include "AMDGPU.h"
+#include "GCNRegPressure.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/SlotIndexes.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "GCNRegPressure.h"
#define DEBUG_TYPE "amdgpu-hot-block-remat"
@@ -111,19 +111,18 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &MF) override;
- void applyCloneRemat(RematNode &Node,
- std::vector<BlockLiveInfo> &HotBlocks,
- MachineDominatorTree *DT, MachineRegisterInfo &MRI,
- SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII, MachineFunction &MF);
+ void applyCloneRemat(RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+ SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF);
void applyRemat(MapVector<Register, RematNode> &RematMap,
- std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
- llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
- const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
- MachineFunction &MF);
+ std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF);
bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
- LiveIntervals *LIS, MachineDominatorTree *DT,
- MachinePostDominatorTree *PDT, bool &IsNearTarget);
+ LiveIntervals *LIS, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, bool &IsNearTarget);
StringRef getPassName() const override { return "AMDGPU rematerialize"; }
@@ -237,11 +236,11 @@ void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef,
}
}
-void AMDGPUHotBlockRematerialize::applyCloneRemat(RematNode &Node,
- std::vector<BlockLiveInfo> &HotBlocks,
- MachineDominatorTree *DT, MachineRegisterInfo &MRI,
- SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII, MachineFunction &MF) {
+void AMDGPUHotBlockRematerialize::applyCloneRemat(
+ RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+ SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF) {
unsigned Reg = Node.Reg;
MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
@@ -359,11 +358,11 @@ void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
SlotIndexes->insertMachineInstrInMaps(*DefMI);
}
-void AMDGPUHotBlockRematerialize::applyRemat(MapVector<Register, RematNode> &RematMap,
- std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
- llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
- const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
- MachineFunction &MF) {
+void AMDGPUHotBlockRematerialize::applyRemat(
+ MapVector<Register, RematNode> &RematMap,
+ std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
+ llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) {
std::vector<RematNode> UpdateList;
for (auto &It : RematMap) {
UpdateList.emplace_back(It.second);
@@ -381,8 +380,7 @@ void AMDGPUHotBlockRematerialize::applyRemat(MapVector<Register, RematNode> &Rem
if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII);
} else if (Node.Kind == RematNode::RematKind::Clone) {
- applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII,
- MF);
+ applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF);
}
}
}
@@ -1234,9 +1232,12 @@ void dumpCandidates(std::vector<RematNode> &RematCandidates, int BlockIndex,
dbgs() << "Total Size:" << TotalSize << "\n";
}
-bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
- LiveIntervals *LIS, MachineDominatorTree *DT,
- MachinePostDominatorTree *PDT, bool &IsNearTarget) {
+bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
+ MachineLoopInfo *MLI,
+ LiveIntervals *LIS,
+ MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT,
+ bool &IsNearTarget) {
const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *SIII = ST->getInstrInfo();
@@ -1489,8 +1490,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoop
if (!SRematMap.empty()) {
IsUpdated = true;
- applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII,
- MF);
+ applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, MF);
LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs()););
}
@@ -1530,4 +1530,3 @@ char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() {
return new AMDGPUHotBlockRematerialize();
}
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index 6d6bd38c61c06..dfb90e5545c8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -13,13 +13,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMIRUtils.h"
-#include "SIRegisterInfo.h"
#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#define DEBUG_TYPE "xb-mir-util"
using namespace llvm;
@@ -101,11 +101,10 @@ bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1,
} // namespace
-
namespace llvm {
bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
- llvm::MachineBasicBlock::iterator MI) {
+ llvm::MachineBasicBlock::iterator MI) {
const TargetRegisterInfo *TRI =
MBB->getParent()->getRegInfo().getTargetRegisterInfo();
for (auto It = MI; It != MBB->end(); ++It) {
@@ -205,9 +204,8 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
// TouchedMBBSet is used for scheduling where local live interval could cross
// multiple regions, need to calculate livereg for each region inside touched
// MBB.
-bool isLocalLiveInterval(
- const LiveInterval &LI, SlotIndexes *Indexes,
- SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes,
+ SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
if (LI.hasSubRanges()) {
for (const auto &S : LI.subranges()) {
if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
index 6b9079e5d65fb..2470e2bed482f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -86,8 +86,8 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
llvm::SlotIndexes *SlotIndexes);
unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
- const llvm::MachineRegisterInfo &MRI,
- const llvm::SIRegisterInfo *SIRI);
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI);
void collectLiveSetPressure(const LiveSet &LiveSet,
const llvm::MachineRegisterInfo &MRI,
const llvm::SIRegisterInfo *SIRI,
@@ -97,6 +97,6 @@ bool reach_block(llvm::MachineBasicBlock *FromBB,
llvm::MachineDominatorTree *DT,
llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
llvm::MachineBasicBlock *ToBB);
-}
+} // namespace llvm
#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
index c2dbf1a8b297e..5c2b7904c46be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -16,8 +16,8 @@
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include <cmath>
@@ -144,7 +144,6 @@ void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
}
}
-
SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
const llvm::MachineLoopInfo *MLI) {
SchedScore TotalScore;
@@ -165,5 +164,3 @@ SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
}
} // namespace llvm
-
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
index b513e7335ffe4..e30df0d457863 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -15,8 +15,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
-#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCInstrItineraries.h"
namespace llvm {
@@ -76,5 +76,5 @@ SchedScore collectLatency(llvm::MachineFunction &MF,
const llvm::GCNSubtarget &ST,
const llvm::MachineLoopInfo *MLI = nullptr);
-}
+} // namespace llvm
#endif
>From 4f7d0dad93c64d94667e74dbd80fdabed3146144 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Tue, 22 Apr 2025 11:54:29 -0700
Subject: [PATCH 6/6] LLVM Style
---
.../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 87 +++++++------------
llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 59 +++++--------
.../AMDGPUOccupancyAndLatencyHelper.cpp | 16 ++--
.../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 5 --
4 files changed, 60 insertions(+), 107 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 3c5d592602c6f..e165b83b18850 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -176,21 +176,17 @@ DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
bool IsDomAllHotBlocks = true;
bool IsDomedByAllHotBlocks = true;
for (MachineBasicBlock *HotMBB : HotBlockSet) {
- if (!DT->dominates(MBB, HotMBB)) {
+ if (!DT->dominates(MBB, HotMBB))
IsDomAllHotBlocks = false;
- }
- if (!DT->dominates(HotMBB, MBB)) {
+ if (!DT->dominates(HotMBB, MBB))
IsDomedByAllHotBlocks = false;
- }
- if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) {
+ if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks)
break;
- }
}
- if (IsDomAllHotBlocks) {
+ if (IsDomAllHotBlocks)
UserBlocks.erase(MBB);
- } else if (IsDomedByAllHotBlocks) {
+ else if (IsDomedByAllHotBlocks)
AfterHotRangeMBBs.insert(MBB);
- }
}
// Split after hotRange block set by domtree.
@@ -274,18 +270,16 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
for (auto UseIt : UserMap) {
MachineBasicBlock *MBB = UseIt.first;
// Skip same block uses.
- if (MBB == DefMI->getParent()) {
+ if (MBB == DefMI->getParent())
continue;
- }
// Skip MBB which share clone from other MBBs.
if (UserMBBSet.count(MBB) == 0)
continue;
Register NewReg = MRI.createVirtualRegister(RC);
auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg);
- for (unsigned I = 1; I < OpNum; I++) {
+ for (unsigned I = 1; I < OpNum; I++)
NewDef = NewDef.add(DefMI->getOperand(I));
- }
MachineInstr *InsertPointMI = UseIt.second.front();
SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI);
@@ -364,9 +358,9 @@ void AMDGPUHotBlockRematerialize::applyRemat(
llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) {
std::vector<RematNode> UpdateList;
- for (auto &It : RematMap) {
+ for (auto &It : RematMap)
UpdateList.emplace_back(It.second);
- }
+
// Sort update list with slotIndex to make sure def moved before use.
// If use moved before def, It might not be the first use anymore.
std::sort(UpdateList.begin(), UpdateList.end(),
@@ -377,11 +371,10 @@ void AMDGPUHotBlockRematerialize::applyRemat(
});
for (RematNode &Node : UpdateList) {
- if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+ if (Node.Kind == RematNode::RematKind::OneDefOneUse)
applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII);
- } else if (Node.Kind == RematNode::RematKind::Clone) {
+ else if (Node.Kind == RematNode::RematKind::Clone)
applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF);
- }
}
}
@@ -410,12 +403,10 @@ unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS,
GCNRegPressure RP = RPTracker.getMaxPressureAndReset();
unsigned SPressure = RP.getMaxSGPR();
- if (SPressure > MaxSPressure) {
+ if (SPressure > MaxSPressure)
MaxSPressure = SPressure;
- }
- if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) {
+ if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure)
MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
- }
Status.MBBPressureMap[&MBB] = RP;
return RP.getOccupancy(*ST);
}
@@ -573,9 +564,8 @@ RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI,
unsigned SInputPressure = 0;
uint64_t Mask = 0xf;
while (Mask != 0) {
- if (Mask & SInputMask) {
+ if (Mask & SInputMask)
SInputPressure += 4;
- }
Mask = Mask << 4;
}
@@ -670,9 +660,8 @@ void updateLiveInfo(MapVector<Register, RematNode> &RematMap,
// still before LiveInfo.BB, It is still live.
unsigned LiveBBIndex = RPOTIndexMap[CurBB];
unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
- if (LiveBBIndex > InsertBBIndex) {
+ if (LiveBBIndex > InsertBBIndex)
continue;
- }
}
// Already in remat map, don't need to check again, remove from
// candidate.
@@ -978,11 +967,10 @@ void buildRematCandiates(std::vector<RematNode> &Candidates,
if (IsSafeCandidate) {
int Gain = rematGain(MI, Reg, MRI, SIRI, IsVGPR);
- if (Gain > 0) {
+ if (Gain > 0)
Candidates.emplace_back(RematNode(Reg, MI, Gain >> 5));
- } else {
+ else
IsSafeCandidate = false;
- }
}
// Save unsafe reg.
if (!IsSafeCandidate)
@@ -1056,9 +1044,9 @@ int filterRematCandiates(std::vector<RematNode> &Candidates,
// Work one def one use first.
for (auto &Node : Candidates) {
unsigned Reg = Node.Reg;
- if (!MRI.hasOneNonDBGUse(Reg)) {
+ if (!MRI.hasOneNonDBGUse(Reg))
continue;
- }
+
MachineInstr *DefMI = Node.DefMI;
if (!isSafeToMove(DefMI, MRI)) {
PinnedRegSet.insert(Reg);
@@ -1074,9 +1062,9 @@ int filterRematCandiates(std::vector<RematNode> &Candidates,
// Try multi use case.
for (auto &Node : Candidates) {
unsigned Reg = Node.Reg;
- if (MRI.hasOneNonDBGUse(Reg)) {
+ if (MRI.hasOneNonDBGUse(Reg))
continue;
- }
+
MachineInstr *DefMI = Node.DefMI;
if (!isSafeToMove(DefMI, MRI)) {
PinnedRegSet.insert(Reg);
@@ -1161,10 +1149,9 @@ int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR,
if (!Reg.isVirtual())
continue;
- if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg())) {
+ if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg()))
// Not support mix of v and s when remat now.
continue;
- }
const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
@@ -1245,9 +1232,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
DenseMap<MachineBasicBlock *, unsigned> RPOTIndexMap;
- for (MachineBasicBlock *MBB : RPOT) {
+ for (MachineBasicBlock *MBB : RPOT)
RPOTIndexMap[MBB] = RPOTIndexMap.size();
- }
auto &MRI = MF.getRegInfo();
@@ -1267,9 +1253,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
RematSCnt += NearTargetRegLimit;
bool IsSGPRSpill = false;
- if (RematSCnt > 0) {
+ if (RematSCnt > 0)
IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
- }
const bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
@@ -1354,9 +1339,9 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
int RematSCnt = MaxSPressure - SReduced - SLimit;
bool IsSGPRSpill = false;
- if (RematSCnt > 0) {
+ if (RematSCnt > 0)
IsSGPRSpill = nearSgprSpill(MaxSPressure, ST, MF);
- }
+
bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
// Try to add candidates into remat list.
@@ -1393,15 +1378,13 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI);
if (((NewRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >=
RematSCnt) {
- for (RematNode &Node : SRematList) {
+ for (RematNode &Node : SRematList)
SRematMap[Node.Reg] = Node;
- }
} else {
if (!IsForceRematSgpr)
return false;
- for (RematNode &Node : SRematList) {
+ for (RematNode &Node : SRematList)
SRematMap[Node.Reg] = Node;
- }
// Find local one def one use candidates.
for (MachineInstr &MI : *MBB) {
if (MI.isDebugInstr())
@@ -1425,9 +1408,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
/*IsVGPR*/ false);
if (Gain > 0) {
// Skip case when DefMI has implicit define which used by UseMI.
- if (isImplicitDefUse(&MI, &UseMI)) {
+ if (isImplicitDefUse(&MI, &UseMI))
continue;
- }
RematNode Node = {Reg, &MI, (unsigned)Gain >> 5};
Node.InsertPointMI = &UseMI;
Node.Kind = RematNode::RematKind::OneDefOneUse;
@@ -1459,19 +1441,16 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
bool IsVRematOK =
(Status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty();
if (NeedSRemat && NeedVRemat) {
- if (IsVRematOK && IsSRematOK) {
+ if (IsVRematOK && IsSRematOK)
IsUpdated = true;
- } else if (IsSGPRSpill) {
+ else if (IsSGPRSpill)
IsUpdated = true;
- }
} else if (NeedSRemat) {
- if (IsSRematOK) {
+ if (IsSRematOK)
IsUpdated = true;
- }
} else if (NeedVRemat) {
- if (IsVRematOK) {
+ if (IsVRematOK)
IsUpdated = true;
- }
}
// TODO: what to do when cannot reach target?
if (NewRematSCnt > 0) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index dfb90e5545c8e..afa1a8853938f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -140,9 +140,8 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
const TargetRegisterInfo *TRI, const SIInstrInfo *TII,
MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) {
// If SCC is dead at MI when we can use MI as the insert point.
- if (!llvm::isSccLiveAt(MBB, MI)) {
+ if (!llvm::isSccLiveAt(MBB, MI))
return MI;
- }
const bool CheckForExecWrite =
Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
@@ -150,11 +149,10 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
// Get the starting reverse iterator taking care to handle the MBB->end()
// case.
MachineBasicBlock::reverse_iterator Start;
- if (MI == MBB->end()) {
+ if (MI == MBB->end())
Start = MBB->rbegin();
- } else {
+ else
Start = MI.getReverse();
- }
// Otherwise, walk backwards through the block looking for a location where
// SCC is dead.
@@ -164,14 +162,12 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
// an insertion point (if that is a constraint from the caller).
// The check for EXEC works for both wave64 and wave32 because
// it will also catch Writes to the subregisters (e.g. exec_lo).
- if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) {
+ if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI))
break;
- }
if (It->modifiesRegister(AMDGPU::SCC, TRI) &&
- !It->readsRegister(AMDGPU::SCC, TRI)) {
+ !It->readsRegister(AMDGPU::SCC, TRI))
return It->getIterator();
- }
}
// If no safe location can be found in the block we can save and restore
@@ -207,20 +203,18 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes,
SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
if (LI.hasSubRanges()) {
- for (const auto &S : LI.subranges()) {
+ for (const auto &S : LI.subranges())
if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
return false;
- }
}
return isLocalLiveRange(&LI, Indexes, TouchedMBBSet);
}
bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
if (LI.hasSubRanges()) {
- for (const auto &S : LI.subranges()) {
+ for (const auto &S : LI.subranges())
if (!isLocalLiveRange(&S, Indexes))
return false;
- }
}
return isLocalLiveRange(&LI, Indexes);
}
@@ -231,9 +225,8 @@ void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
for (auto It : LiveSet) {
int Reg = It.first;
dbgs() << printReg(Reg, SIRI);
- if (It.second.any()) {
+ if (It.second.any())
dbgs() << " mask:" << It.second.getAsInteger();
- }
dbgs() << "\n";
}
}
@@ -405,15 +398,13 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) {
MachineOperand &DstMO = MI.getOperand(0);
// Skip case when dst subReg not 0.
- if (DstMO.getSubReg()) {
+ if (DstMO.getSubReg())
return false;
- }
Register Reg = DstMO.getReg();
SmallVector<MachineOperand *, 2> UseMOs;
- for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) {
+ for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg))
UseMOs.emplace_back(&UseMO);
- }
const llvm::TargetRegisterClass *NewRC =
SIRI->getRegClass(Desc.operands().front().RegClass);
@@ -441,9 +432,8 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
assert(OffsetOp != nullptr);
int64_t Offset = OffsetOp->getImm();
Offset += Offset * LaneSize;
- if (!SIII->isLegalMUBUFImmOffset(Offset)) {
+ if (!SIII->isLegalMUBUFImmOffset(Offset))
return false;
- }
OffsetOp->setImm(Offset);
} else {
return false;
@@ -473,14 +463,12 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
}
}
// Update subReg for users.
- for (MachineOperand *UseMO : UseMOs) {
+ for (MachineOperand *UseMO : UseMOs)
updateSubReg(*UseMO, NewRC, Offset, SIRI);
- }
} else if (NumLanes == getNumLanesIn32BitReg(Reg, SIRI, MRI)) {
// Clear subReg when it's a single 32-bit reg.
- for (MachineOperand *UseMO : UseMOs) {
+ for (MachineOperand *UseMO : UseMOs)
UseMO->setSubReg(0);
- }
}
MI.setDesc(Desc);
@@ -511,9 +499,8 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI,
return false;
LaneBitmask DstMask = getRegMask(MI.getOperand(0), MRI);
LaneBitmask UseMask;
- for (MachineOperand &MO : MRI.use_operands(Reg)) {
+ for (MachineOperand &MO : MRI.use_operands(Reg))
UseMask |= llvm::getRegMask(MO, MRI);
- }
const unsigned FullMask = DstMask.getAsInteger();
unsigned Mask = UseMask.getAsInteger();
@@ -602,11 +589,10 @@ void collectLiveSetPressure(const LiveSet &LiveSet,
for (auto LiveIt : LiveSet) {
unsigned Reg = LiveIt.first;
unsigned Size = getRegSize(Reg, LiveIt.second, MRI, SIRI);
- if (SIRI->isVGPR(MRI, Reg)) {
+ if (SIRI->isVGPR(MRI, Reg))
VPressure += Size;
- } else {
+ else
SPressure += Size;
- }
}
}
@@ -651,21 +637,18 @@ bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) {
bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT,
MachinePostDominatorTree *PDT, MachineLoopInfo *LI,
MachineBasicBlock *ToBB) {
- if (FromBB == ToBB) {
+ if (FromBB == ToBB)
return true;
- }
- if (DT->dominates(FromBB, ToBB)) {
+ if (DT->dominates(FromBB, ToBB))
return true;
- }
- if (PDT->dominates(ToBB, FromBB)) {
+ if (PDT->dominates(ToBB, FromBB))
return true;
- }
- if (loopContainsBoth(LI, ToBB, FromBB)) {
+ if (loopContainsBoth(LI, ToBB, FromBB))
return true;
- }
+
// TODO: cover case hotBB in loop,
// one block in that loop dom BB or
// BB post dom one block in that loop.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
index 5c2b7904c46be..6160fe5471376 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -101,11 +101,10 @@ void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
auto GetAluStatus = [](const MachineInstr &MI,
const llvm::SIInstrInfo *SIII) {
AluStatus Status = AluStatus::Nothing;
- if (SIII->isVALU(MI.getOpcode())) {
+ if (SIII->isVALU(MI.getOpcode()))
Status = AluStatus::Vector;
- } else if (SIII->isSALU(MI.getOpcode())) {
+ else if (SIII->isSALU(MI.getOpcode()))
Status = AluStatus::Scalar;
- }
return Status;
};
AluStatus Status = GetAluStatus(MI, SIII);
@@ -120,11 +119,10 @@ void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
case AluStatus::Scalar: {
Score.Alu += Latency;
// Ignore mix alu.
- if (PrevStatus != Status) {
+ if (PrevStatus != Status)
PrevStatus = AluStatus::Nothing;
- } else {
+ else
Score.MixAlu += Latency;
- }
} break;
}
}
@@ -151,13 +149,11 @@ SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
MachineBasicBlock &MBB = MFI;
MachineBasicBlock::iterator Next;
AMDGPULatencyTracker LatencyTracker(ST);
- for (auto &MI : MBB) {
+ for (auto &MI : MBB)
LatencyTracker.scan(MI);
- }
unsigned LoopDepth = 0;
- if (MLI) {
+ if (MLI)
LoopDepth = MLI->getLoopDepth(&MBB);
- }
TotalScore.sum(LatencyTracker.Score, LoopDepth);
}
return TotalScore;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
index e30df0d457863..9c63fa7e6b4a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -39,12 +39,7 @@ struct SchedScore {
unsigned Lds = 0; // Todo: count lds.
SchedScore() {}
- // Other info which can help compare schedule result.
- float computeScore() const;
- float computeScore2() const;
-
void sum(const SchedScore &S, unsigned LoopDepth = 0);
- bool isBetter(const SchedScore &S) const;
bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const;
// More latency can be hiden with ExtraOcc.
unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;