[llvm] [AMDGPU] Added hot-block-rematerialize pass (PR #126331)
Adam Yang via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 7 17:27:49 PST 2025
https://github.com/adam-yang created https://github.com/llvm/llvm-project/pull/126331
None
>From 095dd0ab0c125215781256ac97e1ea790807e222 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Mon, 3 Feb 2025 14:48:43 -0800
Subject: [PATCH 1/3] Added rematerialize pass and test.
---
.../include/llvm/CodeGen/TargetRegisterInfo.h | 8 +
llvm/lib/CodeGen/TargetRegisterInfo.cpp | 91 +
llvm/lib/Target/AMDGPU/AMDGPU.h | 4 +
.../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 4665 +++++++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 2241 ++++++++
llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 217 +
.../AMDGPU/AMDGPUMirDivergenceAnalysis.cpp | 2767 ++++++++++
.../AMDGPU/AMDGPUMirDivergenceAnalysis.h | 281 +
.../AMDGPUMirSyncDependenceAnalysis.cpp | 511 ++
.../AMDGPU/AMDGPUMirSyncDependenceAnalysis.h | 98 +
.../AMDGPUOccupancyAndLatencyHelper.cpp | 188 +
.../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 74 +
llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 1790 +++++++
llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h | 197 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 +
llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h | 106 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 6 +
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 4 +
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3 +
.../CodeGen/AMDGPU/remat/vector_to_scalar.mir | 405 ++
20 files changed, 13657 insertions(+)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
create mode 100644 llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 114149ff53d850b..4a4d7756ae9ac71 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -430,6 +430,14 @@ class TargetRegisterInfo : public MCRegisterInfo {
LaneBitmask LaneMask,
SmallVectorImpl<unsigned> &Indexes) const;
+ /// Return the set of sub register indexes that minimally cover the given
+ /// lane mask for the given register class.
+ ///
+ /// \returns an empty set if there is no set of covering sub registers.
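+  ///
+  /// For example (illustrative): for a 128-bit register class and a lane mask
+  /// covering its low two 32-bit sub-registers, a single covering index (an
+  /// AMDGPU-style sub0_sub1) is preferred over the pair {sub0, sub1} when the
+  /// target defines one.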
+ std::vector<unsigned>
+ getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterClass *RC,
+ LaneBitmask mask) const;
+
/// The lane masks returned by getSubRegIndexLaneMask() above can only be
/// used to determine if sub-registers overlap - they can't be used to
/// determine if a set of sub-registers completely cover another
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index 77a4c74f1b38b9d..d37796a82899a1a 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -719,3 +719,94 @@ void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex,
dbgs() << printReg(Reg, TRI, SubRegIndex) << "\n";
}
#endif
+
+std::vector<unsigned>
+TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask(
+ const TargetRegisterClass *RC, LaneBitmask mask) const {
+ // TODO: this could replace the code it was copied from in SplitKit.cpp
+
+ // First pass: Try to find a perfectly matching subregister index.
+ // If none exists find the one covering the most lanemask bits.
+ SmallVector<unsigned, 8> PossibleIndexes;
+ unsigned BestIdx = 0;
+ const LaneBitmask avoid = ~mask;
+ {
+ unsigned BestCover = 0;
+ for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) {
+ // Is this index even compatible with the given class?
+ if (getSubClassWithSubReg(RC, Idx) != RC)
+ continue;
+ LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == mask) {
+ BestIdx = Idx;
+ break;
+ }
+
+      // The index must not cover any lanes outside the given lane mask.
+ if ((SubRegMask & avoid).any())
+ continue;
+
+ unsigned PopCount = SubRegMask.getNumLanes();
+ PossibleIndexes.push_back(Idx);
+ if (PopCount > BestCover) {
+ BestCover = PopCount;
+ BestIdx = Idx;
+ }
+ }
+ }
+
+ // Abort if we cannot possibly implement the COPY with the given indexes.
+ if (BestIdx == 0) {
+ LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for "
+ << getRegClassName(RC) << " mask " << PrintLaneMask(mask)
+ << '\n');
+ assert(false && "Impossible to span reg class");
+ return std::vector<unsigned>();
+ }
+
+ std::vector<unsigned> result;
+ result.push_back(BestIdx);
+
+  // Greedy heuristic: keep iterating, choosing the best covering subreg index
+  // each time.
+ mask &= ~(getSubRegIndexLaneMask(BestIdx));
+ while (mask.any()) {
+ BestIdx = 0;
+ int BestCover = std::numeric_limits<int>::min();
+ for (unsigned Idx : PossibleIndexes) {
+ LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == mask) {
+ BestIdx = Idx;
+ break;
+ }
+
+ // Guaranteed above
+ assert((SubRegMask & avoid).none());
+
+ // Try to cover as much of the remaining lanes as possible but as few of
+ // the already covered lanes as possible.
+ int Cover = (SubRegMask & mask).getNumLanes() -
+ (SubRegMask & ~mask).getNumLanes();
+ if (Cover > BestCover) {
+ BestCover = Cover;
+ BestIdx = Idx;
+ }
+ }
+
+ if (BestIdx == 0) {
+ LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for "
+ << getRegClassName(RC) << " mask " << PrintLaneMask(mask)
+ << '\n');
+ assert(false && "Impossible to span reg class");
+ return std::vector<unsigned>();
+ }
+
+ result.push_back(BestIdx);
+ mask &= ~getSubRegIndexLaneMask(BestIdx);
+ }
+
+ return result;
+}
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 31656c98ccd36fa..0f5b4f2277d1a8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -466,6 +466,10 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;
+void initializeAMDGPUHotBlockRematerializePass(llvm::PassRegistry &);
+FunctionPass *createAMDGPUHotBlockRematerializePass();
+extern char &AMDGPUHotBlockRematerializeID;
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
new file mode 100644
index 000000000000000..44ebaa2d51bec19
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -0,0 +1,4665 @@
+//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// AMDGPU hot block rematerialization: moves or clones register definitions
+/// out of high-pressure (hot) blocks to reduce register pressure and avoid
+/// spills.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUMirDivergenceAnalysis.h"
+#include "AMDGPUSubExpDag.h"
+#include "AMDGPUVMemDegreeDAG.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "GCNRegPressure.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "AMDGPUMIRUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+#include <unordered_set>
+#define DEBUG_TYPE "amdgpu-hot-block-remat"
+
+using namespace llvm;
+
+static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
+static cl::opt<bool> EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
+static cl::opt<bool> EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive");
+static cl::opt<bool> EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone");
+static cl::opt<bool> EnableVmemDegree("amdgpu-remat-enable-vmem-degree");
+static cl::opt<bool> EnableInBlockRemat("amdgpu-remat-enable-in-blk-remat");
+static cl::opt<bool> EnableSubExp("amdgpu-remat-enable-sub-exp-remat");
+static cl::opt<bool> EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos");
+static cl::opt<bool> EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg");
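+// Illustrative usage (assuming the pass is enabled in the codegen pipeline):
+//   llc -mtriple=amdgcn ... -amdgpu-remat-target-occupancy=8 \
+//       -amdgpu-remat-enable-sub-exp-remat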
+
+namespace {
+typedef DenseSet<MachineInstr *> InstSet;
+typedef DenseSet<MachineBasicBlock *> BlockSet;
+template<typename T>
+using BlockMap = MapVector<MachineBasicBlock *, T>;
+
+// Rematerialize in a single pass instead of during register allocation.
+// If rematerialization fails during register allocation, it causes a spill.
+class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
+
+public:
+ static char ID;
+
+ DenseSet<const MachineInstr*> TotalUniformInsts;
+ DenseSet<const MachineInstr*> SafeToRemoveInsts;
+ DenseSet<const MachineInstr*> DivergentInsts;
+ void RemoveInst(const MachineInstr *MI) {
+ TotalUniformInsts.erase(MI);
+ SafeToRemoveInsts.erase(MI);
+ DivergentInsts.erase(MI);
+ }
+
+ AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "AMDGPU rematerialize"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
+ AU.addRequired<SlotIndexesWrapperPass>();
+ AU.addRequired<LiveIntervalsWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+typedef AMDGPUHotBlockRematerialize Remat;
+
+} // end anonymous namespace
+
+// Util functions.
+namespace {
+
+MachineBasicBlock *
+nearest_common_dominator(MachineDominatorTree *DT,
+ BlockSet &Blocks) {
+ auto I = Blocks.begin(), E = Blocks.end();
+
+  MachineBasicBlock *DomB = *(I++);
+  while (I != E) {
+    MachineBasicBlock *B = *(I++);
+ DomB = DT->findNearestCommonDominator(DomB, B);
+ if (DomB == nullptr)
+ return nullptr;
+ }
+ // For split block like:
+ // bb.42:
+ // %632.sub2:vreg_128 = V_MOV_B32_e32 %717.sub2:vreg_128, implicit $exec,
+ // // implicit $exec
+ // %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec,
+ // implicitdef $scc, implicit $exec
+ //
+ // bb.68:
+ //; predecessors: %bb.42
+ // successors: %bb.45(0x40000000), %bb.43(0x40000000); %bb.45(50.00%),
+ // %bb.43(50.00%)
+ //
+ // SI_MASK_BRANCH %bb.43, implicit $exec
+ // S_BRANCH %bb.45
+ // which is from
+ // bb.42:
+ //%129:vgpr_32 = V_MOV_B32_e32 killed %548:vgpr_32, implicit $exec, implicit
+ //$exec %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec,
+ // SI_MASK_BRANCH %bb.43, implicit $exec
+ // S_BRANCH %bb.45
+ // The real common dom is bb.42.
+  // TODO: use the _term version of the exec update instructions so this is no
+  // longer needed.
+ if (DomB && DomB->pred_size() == 1 && !DomB->empty()) {
+ // Upstreaming note: This used to be SI_MASK_BRANCH
+ if (DomB->begin()->getOpcode() == AMDGPU::S_CBRANCH_EXECZ) {
+ MachineBasicBlock *Pred = *DomB->pred_begin();
+ if (Pred->succ_size() == 1 &&
+ (Pred->empty() || !Pred->back().isBranch())) {
+ DomB = Pred;
+ }
+ }
+ }
+
+ return DomB;
+}
+
+MachineBasicBlock *find_non_loop_dominator(MachineBasicBlock *BB,
+ MachineDominatorTree *DT,
+ MachineLoopInfo *LI) {
+ while (LI->getLoopDepth(BB) > 0) {
+ MachineDomTreeNode *N = DT->getNode(BB);
+ if (N == nullptr)
+ return nullptr;
+ MachineDomTreeNode *IDom = N->getIDom();
+ if (IDom == nullptr)
+ return nullptr;
+
+ BB = IDom->getBlock();
+ }
+
+ return BB;
+}
+
+MachineBasicBlock *
+FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
+ const MachineRegisterInfo &MRI, bool bMemBound) {
+
+ BlockSet BBSet;
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ BBSet.insert(UseMI.getParent());
+ }
+ if (BBSet.size() == 0)
+ return nullptr;
+
+ MachineBasicBlock *BB = *BBSet.begin();
+ if (BBSet.size() > 1) {
+ MachineBasicBlock *BDom = nearest_common_dominator(DT, BBSet);
+ if (!BDom)
+ return nullptr;
+ BB = BDom;
+ }
+ // Try to find non loop dominator.
+ if (!bMemBound) {
+ BB = find_non_loop_dominator(BB, DT, MLI);
+ }
+ if (!BB)
+ return nullptr;
+
+ // If BB is already a hot block, move to BB will not help.
+ // hotBlockRemat will fail it when process BB.
+
+ // Must reachable from DefMI.
+ if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB))
+ return nullptr;
+
+ return BB;
+}
+
+bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
+ unsigned OpNum = DefMI->getNumOperands();
+
+  // Only move a DefMI whose operands all have unique defs.
+ for (unsigned i = 0; i < OpNum; i++) {
+ MachineOperand &Op = DefMI->getOperand(i);
+ if (!Op.isReg())
+ continue;
+ if (!MRI.getUniqueVRegDef(Op.getReg()) &&
+ !llvm::IsSub0Sub1SingleDef(Op.getReg(), MRI)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// SGPRs have alignment requirements, so an exact register count cannot be
+// computed.
+const unsigned NearTargetRegLimit = 10;
+bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, MachineFunction &MF) {
+ unsigned maxSGPR = ST->getAddressableNumSGPRs();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+ if (ScratchRSrcReg)
+ maxSGPR -= 4;
+
+ const unsigned AlignmentDelta = 3;
+ maxSGPR -= AlignmentDelta;
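+  // E.g., with 102 addressable SGPRs and a scratch rsrc reg reserved, spilling
+  // is considered near once pressure exceeds 102 - 4 - 3 = 95.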
+
+ return maxSPressure > maxSGPR;
+}
+
+struct RematStatus {
+ unsigned TargetOcc;
+ unsigned TargetVLimit;
+ unsigned TargetSLimit;
+ unsigned MaxVPressure;
+ unsigned MaxSPressure;
+ unsigned InputPhysicalVPressure;
+ unsigned InputPhysicalSPressure;
+  // Higher occupancy helps more than the latency cost of reaching it.
+ bool bMemBound;
+ // abs(VTargetOcc-STargetOcc) > 1.
+ bool bNotBalance;
+ DenseMap<MachineBasicBlock *, GCNRegPressure> MBBPressureMap;
+ DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBInputLiveMap;
+ DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBOutputLiveMap;
+  // Collect MBBs which have memory writes. When moving instructions across
+  // MBBs, skip memory insts if the MBB has a memory write. To keep things
+  // fast, just check mayStore and isBarrier.
+ DenseSet<MachineBasicBlock *> MemWriteMBBSet;
+};
+
+unsigned CollectMBBPressure(
+ MachineBasicBlock &MBB, LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure,
+ RematStatus &status) {
+ // Skip processing current block if it has only debug instructions
+ if (MBB.getFirstNonDebugInstr() == MBB.end())
+ return ST->getOccupancyWithNumVGPRs(0);
+ auto BBEnd = MBB.rbegin();
+ GCNUpwardRPTracker RPTracker(*LIS);
+ // R.End doesn't point to the boundary instruction.
+ // Skip Debug instr.
+ if (!llvm::GetNonDebugMBBEnd(BBEnd, MBB))
+ return ST->getOccupancyWithNumVGPRs(0);
+
+ GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB];
+ RPTracker.reset(*BBEnd, &outputLive, true);
+
+ for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) {
+ MachineInstr &MI = (*I++);
+ RPTracker.recede(MI);
+ if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH))
+ status.MemWriteMBBSet.insert(&MBB);
+ }
+
+ GCNRegPressure RP = RPTracker.getMaxPressureAndReset();
+ unsigned sPressure = RP.getMaxSGPR();
+ if (sPressure > maxSPressure) {
+ maxSPressure = sPressure;
+ }
+ if (RP.getVGPRNum(ST->hasGFX90AInsts()) > maxVPressure) {
+ maxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
+ }
+ status.MBBPressureMap[&MBB] = RP;
+ return RP.getOccupancy(*ST);
+}
+
+unsigned CollectFnPressure(
+ MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure,
+ RematStatus &status) {
+ unsigned TgtOcc = ST->getOccupancyWithLocalMemSize(MF);
+  // If there is only one block, the input/output virtual live sets are empty.
+ if (MF.size() > 1) {
+ // Build input output live reg first.
+ auto *SlotIndexes = LIS->getSlotIndexes();
+ DenseMap<MachineBasicBlock *, SlotIndex> MBBInputSlotMap;
+ DenseMap<MachineBasicBlock *, SlotIndex> MBBOutputSlotMap;
+ for (MachineBasicBlock &MBB : MF) {
+ auto BBBegin = MBB.getFirstNonDebugInstr();
+ if (BBBegin != MBB.end()) {
+ auto SI = SlotIndexes->getInstructionIndex(*BBBegin);
+ MBBInputSlotMap[&MBB] = SI;
+ }
+
+ auto BBEnd = MBB.rbegin();
+
+ // R.End doesn't point to the boundary instruction.
+ // Skip Debug instr.
+ if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) {
+ auto SI = SlotIndexes->getInstructionIndex(*BBEnd);
+ MBBOutputSlotMap[&MBB] = SI;
+ }
+ }
+
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ auto Reg = Register::index2VirtReg(I);
+ if (!LIS->hasInterval(Reg))
+ continue;
+
+      const auto &LI = LIS->getInterval(Reg);
+
+      // Skip local live intervals to make live input/output collection faster.
+ if (llvm::isLocalLiveInterval(LI, SlotIndexes))
+ continue;
+
+ for (auto inputIt : MBBInputSlotMap) {
+ MachineBasicBlock *MBB = inputIt.first;
+ auto SI = inputIt.second;
+
+ auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+ if (LiveMask.any())
+ status.MBBInputLiveMap[MBB][Reg] |= LiveMask;
+ }
+
+ for (auto outputIt : MBBOutputSlotMap) {
+ MachineBasicBlock *MBB = outputIt.first;
+ auto SI = outputIt.second;
+
+ auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+ if (LiveMask.any())
+ status.MBBOutputLiveMap[MBB][Reg] |= LiveMask;
+ }
+ }
+ }
+
+ LLVM_DEBUG(
+ const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+ dbgs() << "output live"; for (auto &it
+ : status.MBBOutputLiveMap) {
+ unsigned Idx = it.first->getNumber();
+ auto LiveReg = it.second;
+ dbgs() << "MBB" << Idx << ":";
+ llvm::dumpLiveSet(LiveReg, SIRI);
+ } dbgs() << "input live";
+ for (auto &it
+ : status.MBBInputLiveMap) {
+ unsigned Idx = it.first->getNumber();
+ auto LiveReg = it.second;
+ dbgs() << "MBB" << Idx << ":";
+ llvm::dumpLiveSet(LiveReg, SIRI);
+ });
+
+ for (auto it = MF.begin(); it != MF.end(); ++it) {
+ MachineBasicBlock &MBB = *it;
+ unsigned Occ = CollectMBBPressure(MBB, LIS, MRI, ST, maxVPressure,
+ maxSPressure, status);
+ if (TgtOcc > Occ)
+ TgtOcc = Occ;
+ }
+ return TgtOcc;
+}
+
+RematStatus
+GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
+ const MachineRegisterInfo &MRI, const GCNSubtarget *ST) {
+ unsigned maxSPressure = 0;
+ unsigned maxVPressure = 0;
+ RematStatus status;
+ unsigned TgtOcc = CollectFnPressure(MF, LIS, MRI, ST, maxVPressure,
+ maxSPressure, status);
+ const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+ if (TgtOcc >= MaxOcc) {
+ status.TargetOcc = TgtOcc;
+ status.TargetVLimit = 0;
+ status.TargetSLimit = 0;
+ status.MaxVPressure = 0;
+ status.MaxSPressure = 0;
+ status.InputPhysicalVPressure = 0;
+ status.InputPhysicalSPressure = 0;
+ status.bMemBound = false;
+ status.bNotBalance = false;
+ return status;
+ }
+
+ maxSPressure += RegForVCC;
+ maxVPressure = std::min(maxVPressure, ST->getMaxNumVGPRs(MF));
+ unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(maxSPressure);
+ unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(maxVPressure);
+
+ llvm::SchedScore totalScore = llvm::CollectLatency(MF, *ST, MLI);
+ bool bMemBound =
+ totalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc);
+
+ bool bNotBalance = false;
+
+ const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU();
+  // Currently, only an SGPR-bound occupancy can be fixed with remat.
+ if (STgtOcc < VTgtOcc) {
+ unsigned bigOcc = std::max(STgtOcc, VTgtOcc);
+    // Change TgtOcc to bigOcc in case SGPR and VGPR are not balanced.
+ if (bigOcc > TgtOcc) {
+ TgtOcc = bigOcc;
+ bNotBalance = true;
+ if (TgtOcc >= MaxOccupancy)
+ TgtOcc = MaxOccupancy-1;
+ }
+ }
+
+ // Collect input physical pressure.
+ const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+ unsigned vInputPressure = 0;
+ uint64_t sInputMask = 0;
+ for (const auto &livein : MRI.liveins()) {
+ const Register Reg = livein.first;
+ const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+ assert(Reg.isPhysical() && "input must be physical reg");
+ unsigned RegSize = RC->getLaneMask().getNumLanes();
+ if (SIRI->isVGPR(MRI, Reg)) {
+ vInputPressure += RegSize;
+ } else {
+ unsigned RegIndex = SIRI->getHWRegIndex(Reg);
+      uint64_t mask = ((1ull << RegSize) - 1) << RegIndex;
+ sInputMask |= mask;
+ }
+ }
+  // SGPRs need to be aligned to 4 for the 4-dword/8-dword descriptors, which
+  // causes high pressure.
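+  // E.g., live-in SGPRs that touch two different 4-sgpr aligned groups count
+  // as an input pressure of 8, even if fewer sgprs are actually live.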
+ unsigned sInputPressure = 0;
+ uint64_t mask = 0xf;
+ while (mask != 0) {
+ if (mask & sInputMask) {
+ sInputPressure += 4;
+ }
+ mask = mask << 4;
+ }
+
+ // If balanced, try next occupancy.
+ TgtOcc = bNotBalance ? TgtOcc : (TgtOcc + 1);
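+  // E.g., if the balanced occupancy computed above is 3, the remat target
+  // becomes 4.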
+
+ auto CC = MF.getFunction().getCallingConv();
+ bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS;
+  // For shader profiles other than PS/CS, cap the target occupancy at 4.
+ if (!IsPsCs) {
+ TgtOcc = TgtOcc > 4 ? 4 : TgtOcc;
+ }
+ if (TargetOccupancy)
+ TgtOcc = TargetOccupancy;
+
+ unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true);
+ unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc);
+
+ status.TargetOcc = TgtOcc;
+ status.TargetVLimit = VLimit;
+ status.TargetSLimit = SLimit;
+ status.MaxVPressure = maxVPressure;
+ status.MaxSPressure = maxSPressure;
+ status.InputPhysicalVPressure = vInputPressure;
+ status.InputPhysicalSPressure = sInputPressure;
+ status.bMemBound = bMemBound;
+ status.bNotBalance = bNotBalance;
+ return status;
+}
+
+} // namespace
+
+// Remat.
+namespace {
+
+struct RematNode {
+ enum class RematKind {
+ Candidate, // Not ready yet.
+ OneDefOneUse,
+ Clone,
+ };
+ RematNode()
+ : Reg(0), DefMI(nullptr), Kind(RematKind::Candidate),
+ InsertPointMI(nullptr), InsertBlock(nullptr), Size(0) {}
+ RematNode(unsigned R, MachineInstr *MI, unsigned S)
+ : Reg(R), DefMI(MI), Kind(RematKind::Candidate), InsertPointMI(nullptr),
+ InsertBlock(nullptr), Size(S) {}
+ RematNode(const RematNode &N)
+ : Reg(N.Reg), DefMI(N.DefMI), Kind(N.Kind),
+ InsertPointMI(N.InsertPointMI), InsertBlock(N.InsertBlock),
+ Size(N.Size) {}
+ unsigned Reg;
+ MachineInstr *DefMI;
+ MachineBasicBlock *InsertBlock;
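+  // InsertPointMI is used by OneDefOneUse nodes; UserCount is only used
+  // temporarily by AddCloneCandidate, which packs (Size << 16) | user-block
+  // count into it for sorting Clone nodes.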
+ union {
+ MachineInstr *InsertPointMI;
+ unsigned UserCount;
+ };
+ RematKind Kind;
+ unsigned Size;
+};
+
+struct BlockLiveInfo {
+ MachineBasicBlock *BB;
+ unsigned maxSReg;
+ unsigned maxVReg;
+  // Input live is the set of live regs which cross blocks.
+ const GCNRPTracker::LiveRegSet inputLive;
+};
+
+// Skip live regs rematerialized to other blocks.
+void UpdateLiveInfo(MapVector<unsigned, RematNode> &RematMap,
+ GCNRPTracker::LiveRegSet &LiveSet,
+ const GCNRPTracker::LiveRegSet &inputLive,
+ MachineBasicBlock *CurBB,
+ DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+ for (auto &it : RematMap) {
+ unsigned Reg = it.first;
+ // Skip reg not in live set.
+ if (!LiveSet.count(Reg))
+ continue;
+    // Skip regs already in the input set.
+    // The input set is taken care of in GetReducedSize.
+ if (inputLive.count(Reg))
+ continue;
+
+ auto &Node = it.second;
+ if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+ MachineBasicBlock *InsertBB = Node.InsertBlock;
+      // If CurBB is after InsertBB in reverse post order, the def is still
+      // before CurBB, so it is still live.
+ unsigned LiveBBIndex = RPOTIndexMap[CurBB];
+ unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
+ if (LiveBBIndex > InsertBBIndex) {
+ continue;
+ }
+ }
+ // Already in remat map, don't need to check again, remove from
+ // candidate.
+ LiveSet.erase(Reg);
+ }
+}
+
+int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) {
+
+ // Find shared operand in ReducedInsts.
+ int SharedSize = 0;
+ DenseMap<unsigned, LaneBitmask> SharedRegMaskMap;
+ for (MachineInstr *DefMI : ReducedInsts) {
+ for (MachineOperand &MO : DefMI->operands()) {
+ if (MO.isImm())
+ continue;
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+ if (MO.isTied())
+ continue;
+ Register Reg = MO.getReg();
+
+ if (Reg == AMDGPU::EXEC)
+ continue;
+ if (!Reg.isVirtual())
+ continue;
+
+ bool isVGPR = SIRI->isVGPR(MRI, MO.getReg());
+ if (bVGPR != isVGPR) {
+        // Mixing VGPRs and SGPRs is not supported for remat yet.
+ continue;
+ }
+
+ const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
+ int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
+ unsigned Mask;
+ if (unsigned SubIdx = MO.getSubReg()) {
+ OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
+ int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
+ Mask = (1 << SubMOSize) - 1;
+ } else {
+ Mask = (1 << MOSize) - 1;
+ }
+ auto SharedRegIt = SharedRegMaskMap.find(Reg);
+ if (SharedRegIt == SharedRegMaskMap.end()) {
+ SharedRegMaskMap[Reg] = LaneBitmask(Mask);
+ } else {
+ unsigned PrevMask = SharedRegIt->second.getAsInteger();
+ if (unsigned SharedMask = (PrevMask & Mask)) {
+          // Something is shared.
+ for (int i = 0; i < MOSize; i++) {
+ if (SharedMask & (1 << i)) {
+ SharedSize += 1;
+ }
+ }
+ }
+ LaneBitmask MoMask = LaneBitmask(Mask | PrevMask);
+ SharedRegMaskMap[Reg] = MoMask;
+ }
+ }
+ }
+ return SharedSize;
+}
+
+int GetReducedSize(MapVector<unsigned, RematNode> &RematMap, bool bVGPR,
+ GCNRPTracker::LiveRegSet &CanidateSet,
+ InstSet &ReducedInsts,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ BlockLiveInfo &LiveInfo,
+ DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+ int ReducedSize = 0;
+ for (auto &it : RematMap) {
+ unsigned Reg = it.first;
+
+ if (!CanidateSet.count(Reg))
+ continue;
+
+ bool bReduced = false;
+ auto &Node = it.second;
+ if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+ MachineBasicBlock *InsertBB = Node.InsertBlock;
+      // If LiveInfo.BB is before InsertBB in reverse post order, the def is
+      // moved after LiveInfo.BB, so it is not live anymore.
+ unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB];
+ unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
+ if (LiveBBIndex < InsertBBIndex)
+ bReduced = true;
+ } else {
+ // Clone.
+ bReduced = true;
+      // If there is a use in LiveInfo.BB, it cannot be reduced from the
+      // input live set.
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ if (UseMI.getParent() == LiveInfo.BB) {
+ bReduced = false;
+ break;
+ }
+ }
+ }
+ if (bReduced) {
+ ReducedSize += Node.Size;
+ ReducedInsts.insert(Node.DefMI);
+ }
+
+ // Already in remat map, don't need to check again, remove from candidate.
+ CanidateSet.erase(Reg);
+ }
+
+ return ReducedSize;
+}
+
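+// Estimate the benefit (in bits) of rematerializing Reg next to its use: the
+// size of Reg minus the sizes of its single-def inputs, whose live ranges
+// would be extended instead. E.g., a 128-bit def computed from one 32-bit
+// input gives a gain of 96 bits, i.e. 3 registers after the caller's >> 5.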
+int RematGain(MachineInstr *DefMI, unsigned Reg,
+ GCNRPTracker::LiveRegSet &CandidateRegSet,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ bool bVGPR) {
+ int rematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
+ for (MachineOperand &MO : DefMI->operands()) {
+ if (MO.isImm())
+ continue;
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+ if (MO.isTied())
+ continue;
+
+ if (MO.getReg() == AMDGPU::EXEC)
+ continue;
+
+ // Don't move user of VCC.
+ if (MO.getReg() == AMDGPU::VCC) {
+ rematSize = 0;
+ break;
+ }
+ Register Reg = MO.getReg();
+
+ // Don't move physical register use.
+ if (Reg.isPhysical()) {
+ rematSize = 0;
+ break;
+ }
+
+ bool isVGPR = SIRI->isVGPR(MRI, Reg);
+ if (bVGPR != isVGPR) {
+      // Mixing VGPRs and SGPRs is not supported for remat yet.
+ // TODO: count possible pressure change here.
+ rematSize = 0;
+ break;
+ }
+ bool bSingleDef = MRI.hasOneDef(Reg);
+ if (!bSingleDef) {
+ bSingleDef = llvm::IsSub0Sub1SingleDef(Reg, MRI);
+ }
+
+ if (bSingleDef) {
+      // The reg might be shared with other candidates, but don't check that
+      // here. Shared regs are counted in GetSharedReducedSize.
+ if (EnableAggressive) {
+        // In case of aggressive remat, treat multi-use regs as shared regs
+        // and ignore the size of shared regs.
+ if (!MRI.hasOneNonDBGUse(Reg))
+ continue;
+ }
+ const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
+ if (unsigned SubIdx = MO.getSubReg()) {
+ if (OpRC)
+ OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
+ }
+ int inputSize = SIRI->getRegSizeInBits(*OpRC);
+      // If the input is not live in the hotspot, moving it across the
+      // hotspot should take fewer regs than DefMI.
+ if (rematSize > inputSize) {
+ rematSize -= inputSize;
+ continue;
+ }
+ }
+
+ rematSize = 0;
+ break;
+ }
+ return rematSize;
+}
+
+void BuildRematCandiates(std::vector<RematNode> &Candidates,
+ GCNRPTracker::LiveRegSet &CandidateRegSet,
+ DenseSet<unsigned> &PinnedRegSet,
+ const MachineRegisterInfo &MRI,
+ const SIInstrInfo *SIII, const SIRegisterInfo *SIRI,
+ bool bVGPR) {
+
+ for (auto liveRegIt : CandidateRegSet) {
+ unsigned Reg = liveRegIt.first;
+ // Skip unsafe reg.
+ if (PinnedRegSet.count(Reg))
+ continue;
+
+ bool isVGPR = SIRI->isVGPR(MRI, Reg);
+ if (isVGPR != bVGPR)
+ continue;
+ bool bSafeCandidate = true;
+ MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+ if (MI) {
+ if (bVGPR) {
+ // Only remat valu now.
+ if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY)
+ bSafeCandidate = false;
+ if (MI->getOpcode() == AMDGPU::COPY) {
+ // Make sure src is unique define.
+ if (MI->getOperand(1).isReg() &&
+ nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg()))
+ bSafeCandidate = false;
+ } else {
+ // Skip convergent valu.
+ if (MI->isConvergent())
+ bSafeCandidate = false;
+ }
+ }
+      // Skip insts that have more than 1 def.
+ if (MI->getDesc().NumDefs > 1)
+ bSafeCandidate = false;
+ } else {
+ bSafeCandidate = false;
+ }
+
+ if (bSafeCandidate) {
+ int gain = RematGain(MI, Reg, CandidateRegSet, MRI, SIRI, bVGPR);
+ if (gain > 0) {
+ Candidates.emplace_back(RematNode(Reg, MI, gain >> 5));
+ } else {
+ bSafeCandidate = false;
+ }
+ }
+ // Save unsafe reg.
+ if (!bSafeCandidate)
+ PinnedRegSet.insert(Reg);
+ }
+
+ // Sort by gain.
+ std::sort(Candidates.begin(), Candidates.end(),
+ [](RematNode &i, RematNode &j) { return i.Size > j.Size; });
+}
+
+// For case like
+// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, implicit-def dead $scc; xb.uniform
+// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; xb.uniform
+// %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit killed $scc; xb.uniform
+// Sinking the S_AND right before the S_CSELECT would overwrite SCC.
+// To avoid that, skip cases where DefMI has an implicit def that UseMI uses.
+bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) {
+ if (DefMI->getDesc().NumImplicitDefs == 0)
+ return false;
+
+ auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo();
+ for (MachineOperand &def : DefMI->implicit_operands()) {
+ if (!def.isReg())
+ continue;
+ if (def.isUse())
+ continue;
+ unsigned Reg = def.getReg();
+ if (UseMI->readsRegister(Reg, TRI))
+ return true;
+ }
+ return false;
+}
+
+void AddOneDefOneUseCandidate(RematNode &Node,
+ std::vector<RematNode> &RematList,
+ MachineRegisterInfo &MRI, int &rematCnt,
+ MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT,
+ MachineLoopInfo *MLI, bool bVGPR,
+ bool bMemBound) {
+ unsigned Reg = Node.Reg;
+ MachineInstr *DefMI = Node.DefMI;
+
+ unsigned size = Node.Size;
+ MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin();
+ MachineBasicBlock *InsertBB = UseMI->getParent();
+
+  // For VGPRs, always move next to the only user to avoid WQM or exec issues.
+  // But doing this causes issues when DefMI is in WQM and the single user is
+  // not. Disable VGPR remat for now.
+  // TODO: make sure the single user doesn't need WQM.
+ if (!bVGPR) {
+ if (MachineBasicBlock *NewInsertBB =
+ FindInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, bMemBound)) {
+ if (InsertBB != NewInsertBB) {
+ InsertBB = NewInsertBB;
+        // If a non-loop insert block can be found, use that block.
+ if (DefMI->getParent() != InsertBB) {
+ if (!InsertBB->empty()) {
+ auto it = InsertBB->getFirstNonPHI();
+ it = skipDebugInstructionsForward(it, InsertBB->end());
+ if (it == InsertBB->end())
+ UseMI = nullptr;
+ else
+ UseMI = &*it;
+ }
+ }
+ }
+ }
+ }
+
+ if (bVGPR) {
+    // Don't count regs in the same block for VALU.
+ if (UseMI->getParent() == DefMI->getParent())
+ return;
+ }
+
+  // Skip cases where DefMI has an implicit def which is used by UseMI.
+ if (isImplicitDefUse(DefMI, UseMI)) {
+ return;
+ }
+
+ Node.InsertBlock = InsertBB;
+ Node.InsertPointMI = UseMI;
+ Node.Kind = RematNode::RematKind::OneDefOneUse;
+ RematList.emplace_back(Node);
+ rematCnt += size;
+}
+
+void AddCloneCandidate(std::vector<RematNode *> &cloneList,
+ std::vector<RematNode> &RematList,
+ DenseSet<unsigned> &PinnedRegSet,
+ MachineRegisterInfo &MRI, int &rematCnt,
+ SlotIndexes *SlotIndexes, MachineFunction &MF) {
+ // Group user in same blocks.
+ std::vector<BlockSet> UserSetList(cloneList.size());
+
+  for (size_t i = 0; i < cloneList.size(); i++) {
+ auto *Node = cloneList[i];
+ unsigned Reg = Node->Reg;
+ MachineInstr *DefMI = Node->DefMI;
+ // Group user in same blocks.
+ BlockSet &UserSet = UserSetList[i];
+
+ for (auto useIt = MRI.use_instr_nodbg_begin(Reg);
+ useIt != MRI.use_instr_nodbg_end();) {
+ MachineInstr &UseMI = *(useIt++);
+ UserSet.insert(UseMI.getParent());
+ }
+
+ if (UserSet.size() == 1) {
+ // All users are in same block with DefMI.
+ if (*UserSet.begin() == DefMI->getParent()) {
+        // Mark as not rematerializable for now.
+        // TODO: try to split if it is bigger than 4 and only used once per
+        // channel.
+ PinnedRegSet.insert(Reg);
+ continue;
+ }
+ }
+
+ int size = Node->Size;
+ size <<= 16;
+    // Pack the user-block count into the low 16 bits of size.
+ size |= UserSet.size();
+ Node->UserCount = size;
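+    // E.g., a node of Size 2 with users in 3 blocks ends up with
+    // UserCount == (2 << 16) | 3.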
+ }
+
+ std::sort(cloneList.begin(), cloneList.end(),
+ // Sort based on userSet size.
+ [](const RematNode *a, const RematNode *b) {
+ static constexpr int mask = 0xffff;
+ return (a->UserCount & mask) < (b->UserCount & mask);
+ });
+
+ for (RematNode *Node : cloneList) {
+ Node->Kind = RematNode::RematKind::Clone;
+ RematList.emplace_back(*Node);
+ rematCnt += Node->Size;
+ }
+}
+
+int FilterRematCandiates(std::vector<RematNode> &Candidates,
+ std::vector<RematNode> &RematList,
+ DenseSet<unsigned> &PinnedRegSet,
+ MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ MachineFunction &MF, SlotIndexes *SlotIndexes,
+ bool bVGPR, bool bMemBound) {
+ int rematCnt = 0;
+ // Work one def one use first.
+ for (auto &Node : Candidates) {
+ unsigned Reg = Node.Reg;
+ if (!MRI.hasOneNonDBGUse(Reg)) {
+ continue;
+ }
+ MachineInstr *DefMI = Node.DefMI;
+ if (!IsSafeToMove(DefMI, MRI)) {
+ PinnedRegSet.insert(Reg);
+ continue;
+ }
+
+ AddOneDefOneUseCandidate(Node, RematList, MRI, rematCnt, DT, PDT, MLI,
+ bVGPR, bMemBound);
+ }
+
+ if (!bVGPR) {
+ std::vector<RematNode *> cloneList;
+ // Try multi use case.
+ for (auto &Node : Candidates) {
+ unsigned Reg = Node.Reg;
+ if (MRI.hasOneNonDBGUse(Reg)) {
+ continue;
+ }
+ MachineInstr *DefMI = Node.DefMI;
+ if (!IsSafeToMove(DefMI, MRI)) {
+ PinnedRegSet.insert(Reg);
+ continue;
+ }
+
+ // Clone for each user.
+ cloneList.emplace_back(&Node);
+ }
+
+ AddCloneCandidate(cloneList, RematList, PinnedRegSet, MRI, rematCnt,
+ SlotIndexes, MF);
+ }
+
+ return rematCnt;
+}
+
+void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef,
+ SmallVector<MachineInstr *, 2> &userMIs) {
+ for (MachineInstr *UseMI : userMIs) {
+ for (MachineOperand &MO : UseMI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.getReg() == Reg) {
+ MO.setReg(NewReg);
+ if (bSubRegDef)
+ MO.setSubReg(0);
+ }
+ }
+ }
+}
+
+DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
+ unsigned Reg, BlockMap<SmallVector<MachineInstr *, 2>> &userBlocks,
+ DenseSet<MachineBasicBlock *> &UserMBBSet,
+ std::vector<BlockLiveInfo> &hotBlocks, MachineDominatorTree *pDT) {
+ // Collect hot blocks which Exp is live in.
+ DenseSet<MachineBasicBlock *> hotBlockSet;
+ for (BlockLiveInfo &hotBlock : hotBlocks) {
+ if (hotBlock.inputLive.count(Reg)) {
+ hotBlockSet.insert(hotBlock.BB);
+ }
+ }
+
+  // For userBlocks which dominate all hotBlocks, there is no need to clone,
+  // because the value does not cross the hotBlocks once the later blocks are
+  // cloned.
+  // userBlocks which are dominated by all hotBlocks can share clones, because
+  // past the hot blocks the pressure is OK.
+ DenseSet<MachineBasicBlock *> afterHotRangeMBBs;
+ for (MachineBasicBlock *MBB : UserMBBSet) {
+ // Always clone in hot block.
+ if (hotBlockSet.count(MBB))
+ continue;
+
+ bool bDomAllHotBlocks = true;
+ bool bDomedByAllHotBlocks = true;
+ for (MachineBasicBlock *hotMBB : hotBlockSet) {
+ if (!pDT->dominates(MBB, hotMBB)) {
+ bDomAllHotBlocks = false;
+ }
+ if (!pDT->dominates(hotMBB, MBB)) {
+ bDomedByAllHotBlocks = false;
+ }
+ if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) {
+ break;
+ }
+ }
+ if (bDomAllHotBlocks) {
+ userBlocks.erase(MBB);
+ } else if (bDomedByAllHotBlocks) {
+ afterHotRangeMBBs.insert(MBB);
+ }
+ }
+
+  // Partition the after-hot-range blocks by dominance so that dominated blocks
+  // can share the clone made in their dominator.
+ DenseMap<MachineBasicBlock *, BlockSet> DomMap;
+ if (!afterHotRangeMBBs.empty()) {
+ for (auto it : afterHotRangeMBBs) {
+ MachineBasicBlock *MBB = it;
+ for (auto it2 : afterHotRangeMBBs) {
+ MachineBasicBlock *MBB2 = it2;
+ if (MBB == MBB2)
+ continue;
+ if (pDT->dominates(MBB, MBB2)) {
+ auto &Dom = DomMap[MBB];
+ Dom.insert(MBB2);
+ auto &Dom2 = DomMap[MBB2];
+ Dom.insert(Dom2.begin(), Dom2.end());
+ }
+ }
+ }
+ for (auto it : afterHotRangeMBBs) {
+ MachineBasicBlock *MBB = it;
+ auto &Dom = DomMap[MBB];
+ for (MachineBasicBlock *domedMBB : Dom) {
+ // Remove domedMBB.
+ DomMap.erase(domedMBB);
+ UserMBBSet.erase(domedMBB);
+ }
+ }
+ }
+
+ return DomMap;
+}
+
+// Look for an earlier insert point if the InstructionToMove
+// writes to scc and scc is live at the CurrentInsertPoint.
+static MachineBasicBlock::iterator AdjustInsertPointToAvoidSccSmash(
+    MachineInstr *InstructionToMove, MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  const bool WillSmashScc =
+      InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
+  if (WillSmashScc)
+    CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(
+        MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+
+  return CurrentInsertPoint;
+}
+
+// Look for an earlier insert point if the SubExp
+// writes to scc and scc is live at the CurrentInsertPoint.
+static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash(
+    const SubExp &SubExpToMove, MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI);
+  if (WillSmashScc)
+    CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(
+        MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+
+  return CurrentInsertPoint;
+}
+
+// Return true if moving MI to Location will smash a live scc value.
+static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB,
+                                   MachineBasicBlock::iterator Location) {
+  // It is ok to pass nullptr to `modifiesRegister` for TRI here since
+  // SCC has no subreg/superreg relationships.
+  return MI->modifiesRegister(AMDGPU::SCC, nullptr) &&
+         llvm::IsSccLiveAt(MBB, Location);
+}
+
+void ApplyCloneRemat(Remat *Remat,
+ RematNode &Node, std::vector<BlockLiveInfo> &hotBlocks,
+ MachineDominatorTree *pDT, MachineRegisterInfo &MRI,
+ SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF) {
+ unsigned Reg = Node.Reg;
+
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+ auto DefOp = DefMI->getOperand(0);
+ const MCInstrDesc &Desc = DefMI->getDesc();
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ // When the unique def has subReg, just create newReg for the subReg part.
+ bool bSubRegDef = false;
+ if (DefOp.getSubReg() != 0) {
+ RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg());
+ bSubRegDef = true;
+ }
+ const DebugLoc DL = DefMI->getDebugLoc();
+ unsigned OpNum = DefMI->getNumOperands();
+
+ Node.Kind = RematNode::RematKind::Clone;
+
+ // Group user in same blocks.
+ BlockMap<SmallVector<MachineInstr *, 2>> UserMap;
+ DenseSet<MachineBasicBlock *> UserMBBSet;
+ for (auto useIt = MRI.use_instr_nodbg_begin(Reg);
+ useIt != MRI.use_instr_nodbg_end();) {
+ MachineInstr &UseMI = *(useIt++);
+ UserMap[UseMI.getParent()].emplace_back(&UseMI);
+ UserMBBSet.insert(UseMI.getParent());
+ }
+
+ DenseMap<MachineBasicBlock *, BlockSet> DomMap =
+ reduceClonedMBBs(Reg, UserMap, UserMBBSet, hotBlocks, pDT);
+
+ for (auto useIt : UserMap) {
+ MachineBasicBlock *MBB = useIt.first;
+ // Skip same block uses.
+ if (MBB == DefMI->getParent()) {
+ continue;
+ }
+ // Skip MBB which share clone from other MBBs.
+ if (UserMBBSet.count(MBB) == 0)
+ continue;
+
+ unsigned NewReg = MRI.createVirtualRegister(RC);
+ auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg);
+ for (unsigned i = 1; i < OpNum; i++) {
+ NewDef = NewDef.add(DefMI->getOperand(i));
+ }
+
+ MachineInstr *InsertPointMI = useIt.second.front();
+ SlotIndex lastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI);
+
+ for (MachineInstr *UseMI : useIt.second) {
+ SlotIndex slot = SlotIndexes->getInstructionIndex(*UseMI);
+ if (lastSlot > slot) {
+ lastSlot = slot;
+ InsertPointMI = UseMI;
+ }
+ }
+
+ MachineBasicBlock::iterator InsertPoint = AdjustInsertPointToAvoidSccSmash(
+ DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII
+ );
+
+ for (MachineMemOperand *MO : DefMI->memoperands()) {
+ NewDef->addMemOperand(MF, MO);
+ }
+
+ MBB->insert(InsertPoint, NewDef);
+
+ SlotIndexes->insertMachineInstrInMaps(*NewDef);
+
+ SmallVector<MachineInstr *, 2> &userMIs = useIt.second;
+ updateUsers(Reg, NewReg, bSubRegDef, userMIs);
+
+ // update users in dom MBBs.
+ auto domMapIt = DomMap.find(MBB);
+ if (domMapIt != DomMap.end()) {
+ for (MachineBasicBlock *UpdateMBB : domMapIt->second) {
+ SmallVector<MachineInstr *, 2> &userMIs = UserMap[UpdateMBB];
+ updateUsers(Reg, NewReg, bSubRegDef, userMIs);
+ }
+ }
+
+ llvm::removeUnusedLanes(*NewDef.getInstr(), MRI, SIRI, SIII, SlotIndexes);
+ }
+ if (MRI.use_empty(Reg)) {
+ SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+ Remat->RemoveInst(DefMI);
+ DefMI->eraseFromParent();
+ }
+}
+
+void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
+ SlotIndexes *slotIndexes,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ MachineInstr *DefMI = Node.DefMI;
+ MachineInstr *InsertPointMI = Node.InsertPointMI;
+ MachineBasicBlock* MBB = nullptr;
+
+ // Find a valid insert point.
+ MachineBasicBlock::iterator InsertPoint;
+ if (InsertPointMI) {
+ InsertPoint = InsertPointMI->getIterator();
+ MBB = InsertPointMI->getParent();
+ } else {
+ InsertPoint = Node.InsertBlock->getFirstTerminator();
+ MBB = Node.InsertBlock;
+ }
+
+ InsertPoint = AdjustInsertPointToAvoidSccSmash(
+ DefMI, MBB, InsertPoint, MRI, SIRI, SIII
+ );
+
+ // Move instruction to new location.
+ DefMI->removeFromParent();
+ InsertPoint->getParent()->insert(InsertPoint, DefMI);
+
+ // Update slot index.
+ slotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+ slotIndexes->insertMachineInstrInMaps(*DefMI);
+}
+
+void ApplyRemat(Remat *Remat, MapVector<unsigned, RematNode> &RematMap,
+ std::vector<BlockLiveInfo> &hotBlocks,
+ MachineDominatorTree *pDT, SlotIndexes *slotIndexes,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF) {
+ std::vector<RematNode> UpdateList;
+ for (auto &it : RematMap) {
+ UpdateList.emplace_back(it.second);
+ }
+  // Sort the update list by slot index to make sure defs are moved before
+  // uses. If a use were moved before its def, it might not be the first use
+  // anymore.
+ std::sort(UpdateList.begin(), UpdateList.end(),
+ [&slotIndexes](RematNode &i, RematNode &j) {
+ SlotIndex a = slotIndexes->getInstructionIndex(*i.DefMI);
+ SlotIndex b = slotIndexes->getInstructionIndex(*j.DefMI);
+ return a < b;
+ });
+
+ for (RematNode &Node : UpdateList) {
+ if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+ ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII);
+ } else if (Node.Kind == RematNode::RematKind::Clone) {
+ ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, MF);
+ }
+ }
+}
+
+void dumpRematMap(MapVector<unsigned, RematNode> &RematMap,
+ const SIRegisterInfo *SIRI) {
+ dbgs() << "\n rematMap: \n";
+ for (auto it : RematMap) {
+ int Reg = it.first;
+ dbgs() << printReg(Reg, SIRI);
+ dbgs() << "\n";
+ }
+}
+
+int DebugBlockIndex = 42;
+
+void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet,
+ MapVector<unsigned, RematNode> &VRematMap,
+ MapVector<unsigned, RematNode> &SRematMap, int BlockIndex,
+ const SIRegisterInfo *SIRI) {
+ if (DebugBlockIndex != BlockIndex)
+ return;
+ llvm::dumpLiveSet(LiveSet, SIRI);
+ dumpRematMap(VRematMap, SIRI);
+ dumpRematMap(SRematMap, SIRI);
+}
+
+void dumpCandidates(std::vector<RematNode> &RematCandidates, int BlockIndex,
+ const SIRegisterInfo *SIRI) {
+ if (DebugBlockIndex != BlockIndex)
+ return;
+ dbgs() << "\n Candidates: \n";
+ unsigned TotalSize = 0;
+ for (RematNode &Node : RematCandidates) {
+ dbgs() << printReg(Node.Reg, SIRI) << " size:" << Node.Size;
+ dbgs() << "\n";
+ TotalSize += Node.Size;
+ }
+ dbgs() << "Total Size:" << TotalSize << "\n";
+}
+
+} // namespace
+
+bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
+ LiveIntervals *LIS, MachineDominatorTree *pDT,
+ MachinePostDominatorTree *pPDT, bool &bNearTarget) {
+ const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+
+ const SIInstrInfo *SIII = ST->getInstrInfo();
+ const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ DenseMap<MachineBasicBlock *, unsigned> RPOTIndexMap;
+ for (MachineBasicBlock *MBB : RPOT) {
+ RPOTIndexMap[MBB] = RPOTIndexMap.size();
+ }
+
+ auto &MRI = MF.getRegInfo();
+
+ bool bUpdated = false;
+ RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST);
+
+ const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+ if (status.TargetOcc >= MaxOcc)
+ return false;
+
+ unsigned VLimit = status.TargetVLimit;
+ unsigned SLimit = status.TargetSLimit;
+
+ int rematSCnt = status.MaxSPressure - SLimit;
+  // When doing aggressive SGPR remat, reserve some registers for allocation
+  // loss.
+ if (EnableAggressive)
+ rematSCnt += NearTargetRegLimit;
+
+ bool bSGPRSpill = false;
+ if (rematSCnt > 0) {
+ bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF);
+ }
+
+ bool bForceRematSgpr = bSGPRSpill | status.bNotBalance;
+
+ // If bound by lds, skip.
+ if (status.TargetOcc > ST->getOccupancyWithLocalMemSize(MF) &&
+ !bForceRematSgpr)
+ return false;
+
+ MachineBasicBlock *EntryMBB = &MF.front();
+
+ auto *SlotIndexes = LIS->getSlotIndexes();
+
+  // Regs which are already marked for remat.
+ MapVector<unsigned, RematNode> VRematMap;
+ MapVector<unsigned, RematNode> SRematMap;
+  // Regs which cannot be moved around for remat.
+ DenseSet<unsigned> PinnedRegSet;
+ std::vector<BlockLiveInfo> hotBlocks;
+ for (auto it = po_begin(EntryMBB); it != po_end(EntryMBB); it++) {
+ MachineBasicBlock *MBB = *it;
+ auto &RP = status.MBBPressureMap[MBB];
+    // Ignore blocks that are not hot.
+ if (RP.getVGPRNum(ST->hasGFX90AInsts()) < status.TargetVLimit &&
+ (RP.getMaxSGPR() + RegForVCC + status.InputPhysicalSPressure) <
+ status.TargetSLimit)
+ continue;
+ // Collect reg pressure.
+ unsigned maxVPressure = 0;
+ unsigned maxSPressure = 0;
+ const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB];
+
+ const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB];
+ LLVM_DEBUG(
+ dumpHotBlock(inputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI));
+
+ GCNDownwardRPTracker Tracker(*LIS);
+
+ Tracker.reset(*MBB->begin(), &inputLive);
+
+ for (MachineInstr &MI : *MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ Tracker.advance();
+ auto LISLR = Tracker.getLiveRegs();
+ // Update live set for things already remated.
+ UpdateLiveInfo(VRematMap, LISLR, inputLive, MBB, RPOTIndexMap);
+ UpdateLiveInfo(SRematMap, LISLR, inputLive, MBB, RPOTIndexMap);
+
+ const GCNRPTracker::LiveRegSet &liveSet = LISLR;
+ unsigned VPressure = 0;
+ unsigned SPressure = 0;
+ CollectLiveSetPressure(liveSet, MRI, SIRI, VPressure, SPressure);
+ if (maxVPressure < VPressure)
+ maxVPressure = VPressure;
+ if (maxSPressure < SPressure)
+ maxSPressure = SPressure;
+ }
+ maxSPressure += RegForVCC + status.InputPhysicalSPressure;
+ if (maxVPressure <= VLimit && maxSPressure <= SLimit)
+ continue;
+
+ // Build block live info.
+ // Use outputLive for EntryMBB.
+ BlockLiveInfo LiveInfo = {MBB, maxSPressure, maxVPressure,
+ MBB != EntryMBB ? inputLive : outputLive};
+    // Skip the entry block when saving hotBlocks to reduce cloning, because
+    // we do not clone into the entry block.
+ if (MBB != EntryMBB)
+ hotBlocks.emplace_back(LiveInfo);
+ GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.inputLive;
+
+ // Update reg pressure based on remat list.
+ InstSet VReducedInsts;
+ InstSet SReducedInsts;
+ int VReduced =
+ GetReducedSize(VRematMap, /*bVGPR*/ true, CandidateRegs, VReducedInsts,
+ MRI, SIRI, LiveInfo, RPOTIndexMap);
+ int SReduced =
+ GetReducedSize(SRematMap, /*bVGPR*/ false, CandidateRegs, SReducedInsts,
+ MRI, SIRI, LiveInfo, RPOTIndexMap);
+
+    // Calculate the size that needs to be rematerialized.
+ int rematVCnt = maxVPressure - VReduced - VLimit;
+ int rematSCnt = maxSPressure - SReduced - SLimit;
+
+ bool bSGPRSpill = false;
+ if (rematSCnt > 0) {
+ bSGPRSpill = nearSgprSpill(maxSPressure, ST, MF);
+ }
+ bool bForceRematSgpr = bSGPRSpill | status.bNotBalance;
+ // Try to add candidates into remat list.
+
+ int newRematSCnt = 0;
+ if (rematSCnt > 0) {
+ // Build candidate nodes.
+ std::vector<RematNode> SRematCandidates;
+ BuildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI,
+ SIII, SIRI, /*bVGPR*/ false);
+
+ LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI));
+ std::vector<RematNode> SRematList;
+ // Filter candidates.
+ newRematSCnt =
+ FilterRematCandiates(SRematCandidates, SRematList, PinnedRegSet, pDT,
+ pPDT, MLI, MRI, SIRI, MF, SlotIndexes,
+ /*bVGPR*/ false, status.bMemBound);
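+      // newRematSCnt now holds the total size of the filtered candidates,
+      // which is compared against the required reduction rematSCnt below.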
+ if (newRematSCnt > rematSCnt) {
+        // There are enough remat nodes to cover rematSCnt.
+ int rematCnt = 0;
+ for (RematNode &Node : SRematList) {
+ SRematMap[Node.Reg] = Node;
+ rematCnt += Node.Size;
+ if (rematCnt > rematSCnt && !EnableAggressive)
+ break;
+ }
+ newRematSCnt = 0;
+ } else {
+
+ for (RematNode &Node : SRematList) {
+ SReducedInsts.insert(Node.DefMI);
+ }
+ // Check shared size.
+ int SharedReducedSize =
+ GetSharedReducedSize(SReducedInsts, /*bVGPR*/ false, MRI, SIRI);
+ if (((newRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >=
+ rematSCnt) {
+ for (RematNode &Node : SRematList) {
+ SRematMap[Node.Reg] = Node;
+ }
+ } else {
+ if (!bForceRematSgpr) {
+ return false;
+ } else {
+ for (RematNode &Node : SRematList) {
+ SRematMap[Node.Reg] = Node;
+ }
+ // Find local one def one use candidates.
+ for (MachineInstr &MI : *MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ if (MI.getDesc().NumDefs != 1)
+ continue;
+ MachineOperand &DstMO = MI.getOperand(0);
+ Register Reg = DstMO.getReg();
+ if (!SIRI->isSGPRReg(MRI, Reg))
+ continue;
+ if (!MRI.hasOneNonDBGUse(Reg))
+ continue;
+ if (!MRI.hasOneDef(Reg))
+ continue;
+ if (Reg.isPhysical())
+ continue;
+ MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg);
+ if (UseMI.getParent() != MBB)
+ continue;
+ int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI, /*bVGPR*/false);
+ if (gain > 0) {
+              // Skip cases where DefMI has an implicit def which is used by
+              // UseMI.
+ if (isImplicitDefUse(&MI, &UseMI)) {
+ continue;
+ }
+ RematNode Node = {Reg, &MI, (unsigned)gain >> 5};
+ Node.InsertPointMI = &UseMI;
+ Node.Kind = RematNode::RematKind::OneDefOneUse;
+ SRematMap[Reg] = Node;
+ SharedReducedSize += Node.Size;
+ }
+ }
+ }
+ }
+ newRematSCnt = rematSCnt - newRematSCnt - SharedReducedSize;
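+      // newRematSCnt is now the SGPR reduction still missing after the
+      // selected remats and the shared-size credit.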
+ }
+ }
+ // If works, continue.
+
+ // Collect live range from hot inst.
+ // find common live range in hot insts.
+ // Remat these common live range.
+ // Apply the remat.
+
+ int newRematVCnt = 0;
+ if (rematVCnt > 0) {
+ // TODO: V remat.
+ }
+
+ bool bNeedSRemat = rematSCnt > 0;
+ bool bNeedVRemat = rematVCnt > 0;
+ // If sgpr spill, always do remat.
+ bool bSRematOK =
+ (newRematSCnt <= 0 && !SRematMap.empty()) ||
+ bForceRematSgpr;
+ bool bVRematOK =
+ (status.bNotBalance || newRematVCnt <= 0) && !VRematMap.empty();
+ if (bNeedSRemat && bNeedVRemat) {
+ if (bVRematOK && bSRematOK) {
+ bUpdated = true;
+ } else if (bSGPRSpill) {
+ bUpdated = true;
+ }
+ } else if (bNeedSRemat) {
+ if (bSRematOK) {
+ bUpdated = true;
+ }
+ } else if (bNeedVRemat) {
+ if (bVRematOK) {
+ bUpdated = true;
+ }
+ }
+ // TODO: what to do when cannot reach target?
+ if (newRematSCnt > 0) {
+ if (newRematSCnt <= NearTargetRegLimit) {
+ bNearTarget = true;
+ } else {
+ if (!bSGPRSpill)
+ return false;
+ }
+ }
+ }
+
+ if (SRematMap.empty() && VRematMap.empty()) {
+ return bUpdated;
+ }
+
+ if (!SRematMap.empty()) {
+ bUpdated = true;
+ ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII, MF);
+ LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs()););
+ }
+
+ // Balance between vector and scalar if possible.
+ return bUpdated;
+}
+
+namespace {
+bool isPhyRegUniqueDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+ DenseSet<MachineInstr *> DefMIs;
+ for (MachineInstr &DefMI : MRI.def_instructions(Reg)) {
+    // Skip implicit defs.
+ if (DefMI.getOpcode() == AMDGPU::IMPLICIT_DEF)
+ continue;
+ DefMIs.insert(&DefMI);
+ }
+ return DefMIs.size() == 1;
+}
+
+static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) {
+  if (!MO.isImplicit() || !MO.isUse() || !MO.isReg())
+    return false;
+
+  return MO.getReg() == Reg;
+}
+
+static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) {
+  if (!MO.isImplicit() || !MO.isDef() || !MO.isReg())
+    return false;
+
+  return MO.getReg() == Reg;
+}
+
+static bool IsSafeRematCandidateUser(const MachineInstr *UseMI,
+                                     const SIInstrInfo *SIII) {
+  // Make sure UseMI is not a WQM instruction such as sample.
+ if (SIII->isWQM(UseMI->getOpcode()))
+ return false;
+ if (UseMI->getOpcode() == AMDGPU::PHI)
+ return false;
+
+ return true;
+}
+
+static bool isConvergent(Remat *Remat, const MachineInstr &MI) {
+ return MI.isConvergent() &&
+ // This flag is set on readfirstlane's to indicate that they
+ // are redundant (the value being read is already uniform).
+ // Normally, readfirstlanes are convergent, because different exec
+ // will cause a different value to be read; a known uniform
+ // readfirstlane is safe to move or clone and not actually convergent.
+ !Remat->TotalUniformInsts.count(&MI);
+}
+
+bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, bool bSink) {
+ if (Reg.isPhysical())
+ return false;
+ bool bVGPR = SIRI->isVGPR(MRI, Reg);
+
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+ if (!DefMI)
+ return false;
+ if (DefMI->getOpcode() == AMDGPU::PHI)
+ return false;
+
+ // Skip convergent.
+ if (isConvergent(Remat, *DefMI))
+ return false;
+
+  // Skip insts that have more than 1 def.
+ if (DefMI->getDesc().NumDefs > 1)
+ return false;
+
+ unsigned OpNum = DefMI->getNumOperands();
+
+  // Only move a DefMI whose operands all have unique defs.
+ for (unsigned i = 0; i < OpNum; i++) {
+ MachineOperand &Op = DefMI->getOperand(i);
+ if (!Op.isReg())
+ continue;
+ Register OpReg = Op.getReg();
+ if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO))
+ continue;
+ if (IsImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI))
+ continue;
+    // Allow unused scc defs.
+ if (Op.isImplicit() && Op.isDead() && Op.isDef())
+ continue;
+ if (OpReg.isPhysical())
+ return false;
+ if (!MRI.getUniqueVRegDef(OpReg) && !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) {
+ return false;
+ }
+ }
+
+ if (bVGPR && bSink) {
+ // Skip mem related inst.
+ if (DefMI->mayLoadOrStore()) {
+ return false;
+ }
+
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ if (!IsSafeRematCandidateUser(&UseMI, SIII))
+ return false;
+ }
+ }
+
+ return true;
+}
+
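+// Build SubExps from the candidate regs defined in MBB: collect the candidate
+// defs plus any instructions only used by them, then build an ExpDag with
+// CandidateInput as input and Candidates as output. Candidates with in-block
+// users are either dropped (reported via unUsedPassThrus) or, when
+// bAllowPartialUseInSubExp is set, kept but marked as clone-only.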
+std::vector<SubExp> buildSubExpFromCandidates(
+ Remat *Remat,
+ GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes,
+ GCNRPTracker::LiveRegSet &unUsedPassThrus,
+ bool bAllowPartialUseInSubExp) {
+ InstSet CandidateDefs;
+ DenseSet<unsigned> RemovedCandidates;
+ std::vector<unsigned> CandidateRegs;
+ CandidateRegs.reserve(Candidates.size());
+ for (auto it : Candidates) {
+ unsigned Reg = it.first;
+ CandidateRegs.emplace_back(Reg);
+ }
+  // Sort candidates by defMI order so that each defMI is checked after all of
+  // the nodes it depends on.
+ std::sort(CandidateRegs.begin(), CandidateRegs.end(),
+ [&MRI, &slotIndexes](const unsigned a, unsigned b) {
+ MachineInstr *MIa = MRI.getUniqueVRegDef(a);
+
+ MachineInstr *MIb = MRI.getUniqueVRegDef(b);
+ // Later instr first.
+ return !SlotIndex::isEarlierInstr(
+ slotIndexes->getInstructionIndex(*MIa),
+ slotIndexes->getInstructionIndex(*MIb));
+ });
+
+  // If a candidate def has a user in MBB, only add it when partial candidates
+  // are allowed. A subExp containing such a def can only be cloned, not moved
+  // across blocks, because of the user in MBB.
+ DenseSet<MachineInstr *> PartialCandidates;
+ LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";);
+ for (unsigned Reg : CandidateRegs) {
+ MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+ bool bHasNoCandidatesSameBlockUser = false;
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ if (UseMI.getParent() == MI->getParent()) {
+ if (UseMI.getNumExplicitDefs() == 1) {
+ // Skip user which already in Candidates.
+ unsigned UserDefReg = UseMI.getOperand(0).getReg();
+ if (Candidates.count(UserDefReg) > 0 &&
+ RemovedCandidates.count(UserDefReg) == 0)
+ continue;
+ }
+ if (!bAllowPartialUseInSubExp)
+ bHasNoCandidatesSameBlockUser = true;
+ else
+ PartialCandidates.insert(MI);
+ break;
+ }
+ }
+ if (bHasNoCandidatesSameBlockUser) {
+ RemovedCandidates.insert(Reg);
+ continue;
+ }
+ LLVM_DEBUG(MI->dump());
+ CandidateDefs.insert(MI);
+ }
+ LLVM_DEBUG(dbgs() << "\nCandidate Defs End\n";);
+
+ if (CandidateDefs.empty())
+ return std::vector<SubExp>();
+ for (unsigned Reg : RemovedCandidates) {
+ unUsedPassThrus[Reg] = Candidates[Reg];
+ Candidates.erase(Reg);
+ }
+
+  // Iterate MBB backward and add instructions that are only used by the
+  // candidate defs.
+ for (auto it = MBB->rbegin(); it != MBB->rend(); it++) {
+ MachineInstr &MI = *it;
+ if (CandidateDefs.count(&MI) > 0) {
+ continue;
+ }
+
+ if (isConvergent(Remat, MI))
+ continue;
+ // Skip if MI is not safe to move.
+ if (MI.getNumDefs() != 1) {
+      // Allow moving an unused implicit def.
+      bool bDeadImplicitDef = false;
+      for (MachineOperand &MO : MI.implicit_operands()) {
+        if (!MO.isReg())
+          continue;
+        if (!MO.isDef())
+          continue;
+        bDeadImplicitDef = MO.isDead();
+      }
+      if (!bDeadImplicitDef)
+        continue;
+ }
+
+ unsigned Reg = -1;
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (!MO.isDef())
+ continue;
+ Reg = MO.getReg();
+ break;
+ }
+
+ if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/true))
+ continue;
+
+    // If all users of MI are in the candidate defs, add MI into the candidate
+    // defs. If only some users of MI are in the candidate defs, add MI only
+    // when partial use is allowed.
+ bool bAllUserInCandidate = true;
+ bool bHasCandidateUser = false;
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ if (CandidateDefs.count(&UseMI) == 0)
+ bAllUserInCandidate = false;
+ else
+ bHasCandidateUser = true;
+ }
+ if (!bHasCandidateUser)
+ continue;
+ if (!bAllUserInCandidate) {
+ if (!bAllowPartialUseInSubExp)
+ continue;
+ PartialCandidates.insert(&MI);
+ }
+
+ CandidateDefs.insert(&MI);
+ }
+
+ // Collect input for CandidateDefs.
+ GCNRPTracker::LiveRegSet CandidateInput;
+ for (MachineInstr *MI : CandidateDefs) {
+ for (MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (MO.isImplicit() && Reg.isPhysical())
+ continue;
+
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+ assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) &&
+ "UseMI should be safe to move");
+ if (DefMI && CandidateDefs.count(DefMI) > 0)
+ continue;
+ // Add to input.
+ CandidateInput[Reg] |= llvm::getRegMask(MO, MRI);
+ }
+ }
+
+ // Build defs in order.
+ std::vector<MachineInstr *> defs;
+ defs.reserve(CandidateDefs.size());
+ for (MachineInstr &MI : *MBB) {
+ MachineInstr *pMI = &MI;
+ if (CandidateDefs.count(pMI) == 0)
+ continue;
+ defs.emplace_back(pMI);
+ }
+
+ LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI
+ : defs) {
+ MI->dump();
+ } dbgs() << "\nFinished Candidate Defs End\n";);
+
+  // Build SubExps with CandidateDefs as nodes, CandidateInput as input, and
+  // Candidates as output.
+ ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true);
+ dag.build(CandidateInput, Candidates, defs);
+ if (bAllowPartialUseInSubExp) {
+ for (auto &subExp : dag.SubExps) {
+ for (auto *MI : subExp.SUnits) {
+ if (PartialCandidates.count(MI)) {
+ subExp.bCloneOnly = true;
+ break;
+ }
+ }
+ }
+ }
+ return dag.SubExps;
+}
+
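+// Build SubExps top-down from the candidate regs that are live into MBB:
+// start with their in-block users whose operands are all candidates, grow the
+// set forward with instructions fed only by already-collected defs, then build
+// an ExpDag with Candidates as input and LocalCandidates as output.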
+std::vector<SubExp> buildSubExpFromCandidatesTopBottom(
+ Remat* Remat,
+ GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes) {
+ InstSet CandidateDefs;
+
+ LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";);
+ for (auto it : Candidates) {
+ unsigned Reg = it.first;
+ MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ if (isConvergent(Remat, UseMI))
+ continue;
+ MachineBasicBlock *UseMBB = UseMI.getParent();
+ if (UseMBB == MI->getParent())
+ continue;
+ assert(UseMBB == MBB && "block mismatch");
+ // If all operands in CandidateRegs, add to candidateDefs.
+ bool bHasOpRegNotInCandidates = false;
+ for (MachineOperand &MO : UseMI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+ Register OpReg = MO.getReg();
+ if (MO.isImplicit() && OpReg.isPhysical())
+ continue;
+ if (Candidates.count(OpReg) == 0) {
+ bHasOpRegNotInCandidates = true;
+ break;
+ }
+ }
+ if (bHasOpRegNotInCandidates)
+ continue;
+
+ LLVM_DEBUG(UseMI.dump());
+ CandidateDefs.insert(&UseMI);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\nCandidate Defs End\n";);
+
+ if (CandidateDefs.empty())
+ return std::vector<SubExp>();
+
+  // Iterate MBB forward.
+  GCNRPTracker::LiveRegSet LocalCandidates = Candidates;
+  // Add instructions that are only used by the candidate defs.
+ for (auto it = MBB->begin(); it != MBB->end(); it++) {
+ MachineInstr &MI = *it;
+ if (CandidateDefs.count(&MI) > 0) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (!MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical())
+ continue;
+ LocalCandidates[Reg];
+ }
+ continue;
+ }
+
+ // Skip if MI is not safe to move.
+ if (isConvergent(Remat, MI))
+ continue;
+
+ if (MI.getNumDefs() != 1)
+ continue;
+
+ if (MI.mayLoadOrStore()) {
+ continue;
+ }
+
+ unsigned Reg = -1;
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (!MO.isDef())
+ continue;
+ Reg = MO.getReg();
+ break;
+ }
+
+    // Still apply the bSink behavior of skipping mem load/store (see the
+    // mayLoadOrStore check above) instead of calling isSafeCandidate here.
+    // if (!isSafeCandidate(Reg, MRI, SIRI, SIII, /*bSink*/true))
+    //  continue;
+
+    // If all users of MI are in the candidate defs, add MI into the
+    // candidate defs.
+ bool bAllOperandInCandidate = true;
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+ Register OpReg = MO.getReg();
+ if (LocalCandidates.count(OpReg))
+ continue;
+
+ if (MO.isImplicit() &&
+ (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO))
+ continue;
+ if (OpReg.isPhysical()) {
+ bAllOperandInCandidate = false;
+ break;
+ }
+ MachineInstr *OpMI = MRI.getUniqueVRegDef(OpReg);
+ if (!OpMI) {
+ bAllOperandInCandidate = false;
+ break;
+ }
+ if (CandidateDefs.count(OpMI) == 0) {
+ bAllOperandInCandidate = false;
+ break;
+ }
+ if (MO.isTied())
+ continue;
+ }
+ if (!bAllOperandInCandidate)
+ continue;
+ LLVM_DEBUG(llvm::dbgs() << "Add local candidates:";
+ pressure::print_reg(Reg, MRI, SIRI, llvm::dbgs()););
+ LocalCandidates[Reg];
+ CandidateDefs.insert(&MI);
+ }
+
+ // Collect input for CandidateDefs.
+ GCNRPTracker::LiveRegSet CandidateInput;
+ for (MachineInstr *MI : CandidateDefs) {
+ for (MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO))
+ continue;
+ if (Reg.isPhysical())
+ continue;
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+ if (!DefMI) {
+ // Skip local def which is not unique.
+ if (MO.isTied())
+ continue;
+ if (Candidates.count(Reg) == 0 && LocalCandidates.count(Reg) != 0)
+ continue;
+ }
+ assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) &&
+ "UseMI should be safe to move");
+ if (DefMI && CandidateDefs.count(DefMI) > 0)
+ continue;
+ // Add to input.
+ CandidateInput[Reg] = llvm::getRegMask(MO, MRI);
+ }
+ }
+
+ // Build defs in order.
+ std::vector<MachineInstr *> defs;
+ defs.reserve(CandidateDefs.size());
+ for (MachineInstr &MI : *MBB) {
+ MachineInstr *pMI = &MI;
+ if (CandidateDefs.count(pMI) == 0)
+ continue;
+ defs.emplace_back(pMI);
+ }
+
+ LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI
+ : defs) {
+ MI->dump();
+ } dbgs() << "\nFinished Candidate Defs End\n";);
+
+ LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it
+ : LocalCandidates) {
+ pressure::print_reg(it.first, MRI, SIRI, llvm::dbgs());
+ } dbgs() << "\nLocalCandidates End\n";);
+  // Make sure all input regs have unique defs.
+  // Build SubExps with CandidateDefs as nodes, Candidates as input, and
+  // LocalCandidates as output.
+ ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true);
+ dag.build(Candidates, LocalCandidates, defs);
+ return dag.SubExps;
+}
+
+void print_vreg(Register Reg, const MachineRegisterInfo &MRI) {
+ if (Reg.isVirtual()) {
+ StringRef Name = MRI.getVRegName(Reg);
+ if (Name != "") {
+ dbgs() << '%' << Name;
+ } else {
+ dbgs() << '%' << Register::virtReg2Index(Reg);
+ }
+ }
+}
+
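+// Find the block to sink Reg to: the nearest common dominator of all blocks
+// that use Reg outside FromBB. Return nullptr if Reg has a user in FromBB, the
+// result is FromBB itself, or FromBB does not dominate it.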
+MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB,
+ const MachineRegisterInfo &MRI,
+ MachineDominatorTree *pDT) {
+ BlockSet userBlocks;
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ MachineBasicBlock *UserBB = UseMI.getParent();
+ // Skip current BB.
+ if (UserBB != FromBB)
+ userBlocks.insert(UserBB);
+ else
+      // If there is a user in FromBB, the target block would be FromBB itself.
+ return nullptr;
+ }
+ if (userBlocks.empty())
+ return nullptr;
+ MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks);
+ if (!pDT->dominates(FromBB, userBlock)) {
+ return nullptr;
+ }
+ if (userBlock == FromBB)
+ return nullptr;
+ return userBlock;
+}
+
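+// Sink each single-def instruction of Exp from FromBB into the common
+// dominator of its users, skipping moves that would clobber a live scc, and
+// update the slot indexes.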
+void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI,
+ MachineDominatorTree *pDT,
+ SlotIndexes *slotIndexes,
+ const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI) {
+ // Move from bottom.
+ MachineBasicBlock *FromBB = Exp.FromBB;
+ for (auto it = Exp.SUnits.rbegin(); it != Exp.SUnits.rend(); it++) {
+ MachineInstr *DefMI = *it;
+ if (DefMI->getNumExplicitDefs() != 1)
+ continue;
+
+ unsigned Reg = DefMI->getOperand(0).getReg();
+ MachineBasicBlock *ToBB = FindTargetBlock(Reg, FromBB, MRI, pDT);
+ if (!ToBB)
+ continue;
+
+ // Do not overwrite a live scc.
+    MachineBasicBlock::iterator InsertPoint =
+        ToBB->SkipPHIsAndLabels(ToBB->begin());
+ if (WillSmashSccAtLocation(DefMI, ToBB, InsertPoint))
+ continue;
+
+ DefMI->removeFromParent();
+ assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) && "invalid insert point");
+ ToBB->insert(InsertPoint, DefMI);
+ // Debug insts don't need slot index.
+ if (DefMI->isDebugInstr())
+ continue;
+ // Update slot index.
+ slotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+ slotIndexes->insertMachineInstrInMaps(*DefMI);
+ }
+}
+
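+// Hoist the instructions of Exp into ToBB, inserting before its terminator (or
+// scheduling boundary) while avoiding a live scc, skipping exports and stores,
+// and update the slot indexes.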
+void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI,
+ MachineDominatorTree *pDT,
+ SlotIndexes *slotIndexes,
+ const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI) {
+ // Move from top.
+ // Find lowest input def.
+ MachineBasicBlock *ToBB = Exp.ToBB;
+  assert(!ToBB->empty() && "ToBB must have instructions defining input nodes");
+ auto Terminator = ToBB->getFirstTerminator();
+ if (Terminator == ToBB->end() && ToBB->succ_size() == 1) {
+ MachineInstr &EndMI = *ToBB->rbegin();
+ if (SIII->isSchedulingBoundary(EndMI, ToBB, *ToBB->getParent()))
+ // Insert before the scheduling boundary instruction.
+ Terminator = EndMI.getIterator();
+ else
+ // No boundary so just insert inst at the end of the block.
+ Terminator = ToBB->end();
+ }
+
+  Terminator = AdjustInsertPointForSubExpToAvoidSccSmash(Exp, ToBB, Terminator,
+                                                         MRI, SIRI, SIII);
+
+ for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) {
+ MachineInstr *DefMI = *it;
+ if (DefMI->getNumExplicitDefs() != 1)
+ continue;
+ if (SIII->isEXP(DefMI->getOpcode()))
+ continue;
+ if (DefMI->mayStore())
+ continue;
+ // Find def for DefMI operands as insert point.
+ DefMI->removeFromParent();
+ ToBB->insert(Terminator, DefMI);
+
+ // Debug insts don't need slot index.
+ if (DefMI->isDebugInstr())
+ continue;
+ // Update slot index.
+ slotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+ slotIndexes->insertMachineInstrInMaps(*DefMI);
+ }
+}
+
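+// Walk the dag bottom-up and collect the set of instructions that must be
+// cloned: bottom nodes whose defs are in usedOutput, plus any node with a
+// successor already in the copy set.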
+DenseSet<MachineInstr *> buildCloneSet(ExpDag &dag,
+ DenseSet<SUnit *> &dagBottoms,
+ GCNRPTracker::LiveRegSet &usedOutput) {
+ DenseSet<MachineInstr *> copySet;
+ for (auto it = dag.SUnits.rbegin(); it != dag.SUnits.rend(); it++) {
+ SUnit &SU = *it;
+ // Skip non-inst node.
+ if (!SU.isInstr())
+ continue;
+ MachineInstr *MI = SU.getInstr();
+ if (dagBottoms.find(&SU) != dagBottoms.end()) {
+ bool bUsed = false;
+      // For a bottom SU, if its def is in usedOutput, add it to copySet.
+ for (MachineOperand &DefMO : MI->defs()) {
+ if (!DefMO.isReg())
+ continue;
+ unsigned Reg = DefMO.getReg();
+ if (usedOutput.count(Reg) > 0) {
+ bUsed = true;
+ break;
+ }
+ }
+ if (bUsed) {
+ copySet.insert(MI);
+ continue;
+ }
+      // A bottom SU may still have successor nodes when it is used both inside
+      // and outside the Exp, so keep checking the successors.
+ }
+
+ // If any SuccNode is in copySet, add to copySet.
+ bool bSuccCopied = false;
+ for (SDep &SucDep : SU.Succs) {
+ SUnit *SucSU = SucDep.getSUnit();
+ MachineInstr *SuccMI = SucSU->getInstr();
+ if (copySet.count(SuccMI) > 0) {
+ bSuccCopied = true;
+ break;
+ }
+ }
+ if (bSuccCopied)
+ copySet.insert(MI);
+ }
+ return copySet;
+}
+
+void updateUsers(SmallVector<MachineInstr *, 2> &userMIs,
+ DenseMap<unsigned, unsigned> &RegMap) {
+
+ for (MachineInstr *UserMI : userMIs) {
+ for (MachineOperand &MO : UserMI->uses()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ auto it = RegMap.find(Reg);
+ if (it == RegMap.end())
+ continue;
+ unsigned NewReg = it->second;
+ MO.setReg(NewReg);
+ }
+ }
+}
+
+struct HotBlock {
+ MachineBasicBlock *MBB = nullptr;
+ GCNRPTracker::LiveRegSet inputLive;
+ std::pair<unsigned, unsigned> maxPressures;
+ // Info about vmemLd.
+ int vmemLdInputSize;
+ int vmemLdOutputSize;
+};
+
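+// Reduce the set of blocks that need a clone of Exp: drop user blocks that
+// dominate every hot block, and let user blocks dominated by all hot blocks
+// share a single clone via the returned dominance map.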
+DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
+ SubExp &Exp,
+ MapVector<MachineBasicBlock *, SmallVector<MachineInstr *, 2>> &userBlocks,
+ DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> &userBlocksLiveRegs,
+ std::vector<HotBlock> &hotBlocks, MachineDominatorTree *pDT) {
+ // Collect hot blocks which Exp is live in.
+ DenseSet<MachineBasicBlock *> hotBlockSet;
+ for (HotBlock &hotBlock : hotBlocks) {
+ for (unsigned Reg : Exp.BottomRegs) {
+ if (hotBlock.inputLive.count(Reg)) {
+ hotBlockSet.insert(hotBlock.MBB);
+ break;
+ }
+ }
+ }
+
+  // UserBlocks that dominate all hotBlocks do not need a clone, because the
+  // value will not cross the hotBlocks once the later blocks are cloned.
+  // UserBlocks that are dominated by all hotBlocks can share clones, because
+  // the pressure is fine once past the hot blocks.
+ DenseSet<MachineBasicBlock *> afterHotRangeMBBs;
+ for (auto it : userBlocksLiveRegs) {
+ MachineBasicBlock *MBB = it.first;
+ // Always clone in hot block.
+ if (hotBlockSet.count(MBB))
+ continue;
+
+ bool bDomAllHotBlocks = true;
+ bool bDomedByAllHotBlocks = true;
+ for (MachineBasicBlock *hotMBB : hotBlockSet) {
+ if (!pDT->dominates(MBB, hotMBB)) {
+ bDomAllHotBlocks = false;
+ }
+ if (!pDT->dominates(hotMBB, MBB)) {
+ bDomedByAllHotBlocks = false;
+ }
+ if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) {
+ break;
+ }
+ }
+ if (bDomAllHotBlocks) {
+ userBlocks.erase(MBB);
+ } else if (bDomedByAllHotBlocks) {
+ afterHotRangeMBBs.insert(MBB);
+ }
+ }
+
+ // Split after hotRange block set by domtree.
+ DenseMap<MachineBasicBlock *, BlockSet> DomMap;
+ if (!afterHotRangeMBBs.empty()) {
+ for (auto it : afterHotRangeMBBs) {
+ MachineBasicBlock *MBB = it;
+ for (auto it2 : afterHotRangeMBBs) {
+ MachineBasicBlock *MBB2 = it2;
+ if (MBB == MBB2)
+ continue;
+ if (pDT->dominates(MBB, MBB2)) {
+ auto &Dom = DomMap[MBB];
+ Dom.insert(MBB2);
+ auto &Dom2 = DomMap[MBB2];
+ Dom.insert(Dom2.begin(), Dom2.end());
+ }
+ }
+ }
+ for (auto it : afterHotRangeMBBs) {
+ MachineBasicBlock *MBB = it;
+ auto &usedOutput = userBlocksLiveRegs[MBB];
+ auto &Dom = DomMap[MBB];
+ for (MachineBasicBlock *domedMBB : Dom) {
+ // Merge domed use to MBB use.
+ mergeLiveRegSet(usedOutput, userBlocksLiveRegs[domedMBB]);
+ // Remove domedMBB.
+ DomMap.erase(domedMBB);
+ userBlocksLiveRegs.erase(domedMBB);
+ }
+ }
+ }
+
+ return DomMap;
+}
+
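+// Clone Exp into each user block that still needs it (per reduceClonedMBBs),
+// rewriting the cloned defs to fresh virtual registers and updating the users
+// in that block and in the blocks it dominates.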
+void ApplySubExpCloneNearUser(SubExp &Exp, std::vector<HotBlock> &hotBlocks,
+ MachineDominatorTree *pDT,
+ MachineRegisterInfo &MRI,
+ SlotIndexes *slotIndexes, const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI) {
+ MapVector<MachineBasicBlock *, SmallVector<MachineInstr *, 2>> userBlocks;
+ DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> userBlocksLiveRegs;
+ for (unsigned Reg : Exp.BottomRegs) {
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ MachineBasicBlock *UserBB = UseMI.getParent();
+ // Skip current BB.
+ if (UserBB == Exp.FromBB)
+ continue;
+
+ userBlocks[UserBB].emplace_back(&UseMI);
+ auto &userLives = userBlocksLiveRegs[UserBB];
+ for (MachineOperand &MO : UseMI.uses()) {
+ if (!MO.isReg())
+ continue;
+ unsigned UseReg = MO.getReg();
+ if (Reg != UseReg)
+ continue;
+ userLives[Reg] |= getRegMask(MO, MRI);
+ }
+ }
+ }
+ // Build dag for SubExp to help remove unused inst when clone.
+ ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true);
+ dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits);
+ DenseSet<SUnit *> dagBottoms;
+ for (SUnit &SU : dag.SUnits) {
+ if (!SU.isInstr())
+ continue;
+ if (SU.NumSuccs == 0) {
+ dagBottoms.insert(&SU);
+ } else {
+ MachineInstr *MI = SU.getInstr();
+ // Add SU which def value in Exp.outputLive.
+ for (MachineOperand &DefMO : MI->defs()) {
+ if (!DefMO.isReg())
+ continue;
+ unsigned Reg = DefMO.getReg();
+ if (Exp.BottomRegs.count(Reg) > 0) {
+ dagBottoms.insert(&SU);
+ break;
+ }
+ }
+ }
+ }
+
+  // UserBlocks that dominate all hotBlocks do not need a clone, because the
+  // value will not cross the hotBlocks once the later blocks are cloned.
+  // UserBlocks that are dominated by all hotBlocks can share clones, because
+  // the pressure is fine once past the hot blocks.
+ DenseMap<MachineBasicBlock *, BlockSet> DomMap =
+ reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, pDT);
+
+  // Sort to get a stable order.
+ std::sort(userBlocks.begin(), userBlocks.end(),
+ [](std::pair<MachineBasicBlock*, SmallVector<MachineInstr*, 2>>& it0,
+ std::pair<MachineBasicBlock*, SmallVector<MachineInstr*, 2>>& it1) {
+ return it0.first->getNumber() < it1.first->getNumber();
+ });
+
+ const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI);
+
+  // Clone for each user block. Do not share clones through the dom tree, since
+  // that would not help register pressure.
+ for (auto it : userBlocks) {
+ MachineBasicBlock *MBB = it.first;
+ // Skip MBB which share clone from other MBBs.
+ if (userBlocksLiveRegs.count(MBB) == 0)
+ continue;
+ auto &usedOutput = userBlocksLiveRegs[MBB];
+ auto copySet = buildCloneSet(dag, dagBottoms, usedOutput);
+ // Clone to MBB.
+ // Create new regs first.
+ DenseMap<unsigned, unsigned> RegMap;
+ auto insertPtr = MBB->getFirstNonPHI();
+    // If Exp reads/writes scc, make sure scc is not live at the insert point.
+ if (bModifiesScc && llvm::IsSccLiveAt(MBB, insertPtr))
+ continue;
+ MachineFunction *MF = MBB->getParent();
+ for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) {
+ MachineInstr *DefMI = *it;
+ // Not clone if already in MBB.
+ if (DefMI->getParent() == MBB)
+ continue;
+ // Not clone if not used for MBB.
+ if (copySet.count(DefMI) == 0)
+ continue;
+
+ auto ClonedMI =
+ BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc());
+
+ for (MachineOperand &Def : DefMI->defs()) {
+ Register Reg = Def.getReg();
+ if (Reg.isPhysical()) {
+ if (Def.isImplicit())
+ continue;
+ ClonedMI.addDef(Reg, 0, Def.getSubReg());
+ } else {
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ RegMap[Reg] = NewReg;
+ ClonedMI.addDef(NewReg, 0, Def.getSubReg());
+ }
+ }
+
+ for (MachineOperand &MO : DefMI->uses()) {
+ if (MO.isReg()) {
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical()) {
+ if (MO.isImplicit())
+ continue;
+ ClonedMI.addReg(Reg, 0, MO.getSubReg());
+ } else {
+ auto it = RegMap.find(Reg);
+ if (it == RegMap.end()) {
+ ClonedMI.addReg(Reg, 0, MO.getSubReg());
+ } else {
+ ClonedMI.addReg(it->second, 0, MO.getSubReg());
+ }
+ }
+ } else {
+ ClonedMI.add(MO);
+ }
+ }
+
+ MachineInstr *NewDef = ClonedMI.getInstr();
+ slotIndexes->insertMachineInstrInMaps(*NewDef);
+      // Copy the memory operands from DefMI.
+ for (MachineMemOperand *MO : DefMI->memoperands()) {
+ NewDef->addMemOperand(*MF, MO);
+ }
+ }
+
+    // Update users in MBB.
+ SmallVector<MachineInstr *, 2> &userMIs = it.second;
+ updateUsers(userMIs, RegMap);
+
+    // Update users in dominated MBBs.
+ auto domMapIt = DomMap.find(MBB);
+ if (domMapIt != DomMap.end()) {
+ for (MachineBasicBlock *UpdateMBB : domMapIt->second) {
+ SmallVector<MachineInstr *, 2> &userMIs = userBlocks[UpdateMBB];
+ updateUsers(userMIs, RegMap);
+ }
+ }
+ }
+}
+
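+// Clone Exp inside its own block, right before its first user that comes after
+// the hot instruction, remapping the cloned defs to fresh virtual registers
+// and rewriting those users.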
+void ApplySubExpCloneNearUserInBlock(
+ SubExp &Exp,
+ DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotVInstMap,
+ DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotSInstMap,
+ MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI) {
+ MachineBasicBlock *MBB = Exp.FromBB;
+ MachineFunction *MF = MBB->getParent();
+ MachineInstr *hotVMI = inBlockHotVInstMap[MBB];
+ MachineInstr *hotSMI = inBlockHotSInstMap[MBB];
+  // Exp is built with either hotVMI or hotSMI, never both.
+ assert(!(hotVMI && hotSMI) && "cannot mix hot MI");
+ MachineInstr *hotMI = hotVMI;
+ if (!hotMI) {
+ hotMI = hotSMI;
+ }
+
+ SlotIndex hotSlot = slotIndexes->getInstructionIndex(*hotMI).getBaseIndex();
+ const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI);
+
+ for (unsigned Reg : Exp.BottomRegs) {
+
+ SmallVector<MachineInstr *, 2> useMIs;
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ MachineBasicBlock *UserBB = UseMI.getParent();
+ // Skip current BB.
+ if (UserBB != Exp.FromBB)
+ continue;
+ // Skip inst in Exp.
+ if (Exp.BottomRoots.find(&UseMI) != Exp.BottomRoots.end())
+ continue;
+ SlotIndex useSlot =
+ slotIndexes->getInstructionIndex(UseMI).getBaseIndex();
+ // Only clone for use after hot slot.
+ if (useSlot < hotSlot)
+ continue;
+
+ // Do not overwrite a live scc.
+ if (bModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI))
+ continue;
+
+ useMIs.emplace_back(&UseMI);
+ }
+ if (useMIs.empty())
+ continue;
+ DenseMap<unsigned, unsigned> RegMap;
+
+ std::sort(useMIs.begin(), useMIs.end(),
+ [&slotIndexes](const MachineInstr *MIa, const MachineInstr *MIb) {
+ return slotIndexes->getInstructionIndex(*MIa).getBaseIndex() <
+ slotIndexes->getInstructionIndex(*MIb).getBaseIndex();
+ });
+ auto insertPtr = useMIs.front()->getIterator();
+
+ for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) {
+ MachineInstr *DefMI = *it;
+ auto ClonedMI =
+ BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc());
+
+ for (MachineOperand &Def : DefMI->defs()) {
+ Register Reg = Def.getReg();
+ if (Reg.isPhysical()) {
+ ClonedMI.addDef(Reg, 0, Def.getSubReg());
+ } else {
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ RegMap[Reg] = NewReg;
+ ClonedMI.addDef(NewReg, 0, Def.getSubReg());
+ }
+ }
+
+ for (MachineOperand &MO : DefMI->uses()) {
+ if (MO.isReg()) {
+ if (MO.isImplicit()) {
+ continue;
+ }
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical()) {
+ ClonedMI.addReg(Reg, 0, MO.getSubReg());
+ } else {
+ auto it = RegMap.find(Reg);
+ if (it == RegMap.end()) {
+ ClonedMI.addReg(Reg, 0, MO.getSubReg());
+ } else {
+ ClonedMI.addReg(it->second, 0, MO.getSubReg());
+ }
+ }
+ } else {
+ ClonedMI.add(MO);
+ }
+ }
+
+ MachineInstr *NewDef = ClonedMI.getInstr();
+ slotIndexes->insertMachineInstrInMaps(*NewDef);
+      // Copy the memory operands from DefMI.
+ for (MachineMemOperand *MO : DefMI->memoperands()) {
+ NewDef->addMemOperand(*MF, MO);
+ }
+ }
+ // TODO: only clone to cross hot range.
+ for (MachineInstr *UseMI : useMIs) {
+ for (MachineOperand &MO : UseMI->uses()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ auto it = RegMap.find(Reg);
+ if (it == RegMap.end())
+ continue;
+ unsigned NewReg = it->second;
+ MO.setReg(NewReg);
+ }
+ }
+ }
+}
+
+bool isInLiveSet(unsigned Reg, LaneBitmask mask,
+ const GCNRPTracker::LiveRegSet &live) {
+ auto it = live.find(Reg);
+ if (it == live.end())
+ return false;
+
+ LaneBitmask liveMask = it->second;
+ return (liveMask | mask) == liveMask;
+}
+
+unsigned getPacifistLevel(unsigned Reg,
+ DenseMap<MachineInstr *, unsigned> &pacifistLevels,
+ const MachineRegisterInfo &MRI) {
+ unsigned level = 0;
+ for (MachineInstr &MI : MRI.def_instructions(Reg)) {
+ auto it = pacifistLevels.find(&MI);
+ if (it == pacifistLevels.end())
+ continue;
+ level = it->second;
+ }
+ return level;
+}
+
+bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB,
+ const MachineRegisterInfo &MRI) {
+ for (MachineInstr &def : MRI.def_instructions(Reg)) {
+ if (def.getParent() != MBB)
+ continue;
+ return true;
+ }
+ return false;
+}
+
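+// Return the single def of Reg inside MBB, or nullptr if Reg is live into the
+// block, has no def in the block, or has multiple in-block defs.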
+MachineInstr *getInBlockUniqueDef(unsigned Reg, MachineBasicBlock *MBB,
+ const GCNRPTracker::LiveRegSet &inputLive,
+ const GCNRPTracker::LiveRegSet &outputLive,
+ const MachineRegisterInfo &MRI) {
+ MachineInstr *DefMI = nullptr;
+ // If live as input for MBB, cannot be unique def.
+ if (inputLive.count(Reg))
+ return DefMI;
+ for (MachineInstr &def : MRI.def_instructions(Reg)) {
+ if (def.getParent() != MBB)
+ continue;
+ if (DefMI) {
+ // Not unique.
+ DefMI = nullptr;
+ break;
+ }
+ DefMI = &def;
+ }
+ return DefMI;
+}
+
+bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &inputLive,
+ const GCNRPTracker::LiveRegSet &outputLive) {
+ return inputLive.count(Reg) && outputLive.count(Reg);
+}
+
+// Instructions which only use imm, pass-through regs, or output-only regs will
+// not kill any live reg, so we name them pacifists here.
+bool collectPacifist(MachineInstr &MI,
+ const GCNRPTracker::LiveRegSet &inputLive,
+ const GCNRPTracker::LiveRegSet &outputLive,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) {
+  // If the instruction has an implicit def, do not move it.
+ if (MI.getDesc().NumImplicitDefs != 0)
+ return false;
+
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+
+ Register Reg = MO.getReg();
+ if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO))
+ continue;
+ if (Reg.isPhysical())
+ return false;
+    // The reg must either have a unique def in the block, or be a pass-through
+    // with no def in the block. Otherwise it is not safe to move.
+ if (!(nullptr != getInBlockUniqueDef(Reg, MI.getParent(), inputLive,
+ outputLive, MRI) ||
+ (isPassThru(Reg, inputLive, outputLive) &&
+ !hasInBlockDef(Reg, MI.getParent(), MRI))))
+ return false;
+
+ LaneBitmask mask = llvm::getRegMask(MO, MRI);
+
+ if (isInLiveSet(Reg, mask, outputLive))
+ continue;
+
+ return false;
+ }
+ bool bHasDef = false;
+ for (MachineOperand &MO : MI.defs()) {
+ Register Reg = MO.getReg();
+
+ if (Reg.isPhysical())
+ return false;
+
+ if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI))
+ return false;
+
+ bHasDef = true;
+ }
+ // If no def, it will not increase pressure, don't mark it.
+ return bHasDef;
+}
+
+static MachineInstr *findFirstAliasingLoadOrStoreInMBB(MachineInstr &MI,
+                                                       MachineBasicBlock &MBB,
+                                                       AliasAnalysis *AA) {
+  if (MI.mayLoadOrStore()) {
+    for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end();
+         I != E; ++I) {
+      const bool UseTBAA = false;
+      if (MI.mayAlias(AA, *I, UseTBAA))
+        return &*I;
+    }
+  }
+
+  return nullptr;
+}
+
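+// Find where a pacifist instruction can be sunk to: its earliest in-block user
+// or the first aliasing memory access, whichever comes first. Returns nullptr
+// if it has no user in the block.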
+static MachineInstr *findPacifistInsertPoint(MachineInstr &MI,
+                                             MachineBasicBlock &MBB,
+                                             MachineRegisterInfo &MRI,
+                                             AliasAnalysis *AA,
+                                             SlotIndexes *slotIndexes) {
+
+ SmallVector<MachineInstr *, 2> users;
+
+ // We cannot move the pacifist instruction past any memory
+ // op with which it aliases. Find the first instruction
+ // that aliases the pacifist MI (if any) and add it to the list
+ // of users. The sort() below will select the earliest user instruction.
+ if (MachineInstr* AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) {
+ users.push_back(AliasMI);
+ }
+
+ for (MachineOperand &MO : MI.defs()) {
+ unsigned Reg = MO.getReg();
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+      if (&MBB != UseMI.getParent())
+        continue;
+      users.emplace_back(&UseMI);
+    }
+ }
+ if (users.empty())
+ return nullptr;
+
+ std::sort(users.begin(), users.end(),
+ [&slotIndexes](const MachineInstr *MIa, MachineInstr *MIb) {
+ // Early instr first.
+ return SlotIndex::isEarlierInstr(
+ slotIndexes->getInstructionIndex(*MIa),
+ slotIndexes->getInstructionIndex(*MIb));
+ });
+ return users.front();
+}
+
+// Pacifist insts only add pressure since they don't kill anything. Try to
+// sink them as late as possible in the MBB to help pressure.
+bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS,
+                     MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                     const SIInstrInfo *SIII, AliasAnalysis *AA,
+                     RematStatus &status) {
+ const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB];
+ const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB];
+
+ SmallVector<MachineInstr *, 32> pacifistList;
+ LLVM_DEBUG(dbgs() << "pacifist begin\n");
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ if (collectPacifist(MI, inputLive, outputLive, MRI, SIRI)) {
+ pacifistList.emplace_back(&MI);
+ LLVM_DEBUG(MI.dump());
+ }
+ }
+ LLVM_DEBUG(dbgs() << "pacifist end\n");
+
+ SlotIndexes *slotIndexes = LIS->getSlotIndexes();
+ bool bUpdated = false;
+
+ // Move pacifist to its first user.
+ for (MachineInstr *MI : pacifistList) {
+ MachineInstr *firstUser = findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes);
+ if (firstUser == MI)
+ continue;
+ if (firstUser == MI->getNextNode())
+ continue;
+
+ auto insertPoint = MBB.getFirstInstrTerminator();
+ if (firstUser) {
+ insertPoint = firstUser->getIterator();
+ } else {
+ // When there's no terminator.
+ if (insertPoint == MBB.end())
+ insertPoint--;
+ else
+ // BRANCH may have exec update before it.
+ insertPoint--;
+
+ insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin());
+
+      while ((insertPoint->definesRegister(AMDGPU::EXEC, SIRI) ||
+              insertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) &&
+             insertPoint != MI->getIterator()) {
+        insertPoint--;
+        insertPoint = llvm::skipDebugInstructionsBackward(insertPoint,
+                                                          MBB.instr_begin());
+      }
+ if (insertPoint == MI->getIterator())
+ continue;
+ }
+ // Do not overwrite a live scc.
+ if (WillSmashSccAtLocation(MI, &MBB, insertPoint))
+ continue;
+ MI->removeFromParent();
+ MBB.insert(insertPoint, MI);
+
+ LIS->handleMove(*MI);
+ bUpdated = true;
+ }
+
+ return bUpdated;
+}
+
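+// Collect uniform instructions that define a single, untied VGPR_32 vdst with
+// a unique def; these are the candidates for the vgpr-to-sgpr rewrite below.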
+DenseMap<unsigned, MachineInstr *>
+collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) {
+ DenseMap<unsigned, MachineInstr *> UniformMap;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ if (!Remat->TotalUniformInsts.count(&MI))
+ continue;
+ if (MI.getNumDefs() != 1)
+ continue;
+      int dstIdx =
+          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst);
+      if (dstIdx == -1)
+ continue;
+ MachineOperand &DstMO = MI.getOperand(dstIdx);
+ if (DstMO.getSubReg() != 0)
+ continue;
+ if (DstMO.isTied())
+ continue;
+ unsigned Reg = DstMO.getReg();
+ if (MRI.getUniqueVRegDef(Reg) == nullptr)
+ continue;
+
+ auto *VRC = SIRI->getRegClassForReg(MRI, Reg);
+ if (SIRI->isSGPRClass(VRC))
+ continue;
+ // TODO: Support more reg class.
+ if (VRC != &AMDGPU::VGPR_32RegClass)
+ continue;
+
+ UniformMap[Reg] = &MI;
+ }
+ }
+ return UniformMap;
+}
+
+// Try to insert a readfirstlane on a uniform vgpr to turn it into an sgpr and
+// save vgpr pressure.
+bool collectVToSCrossHotSpot(
+ MachineBasicBlock &MBB, RematStatus &status,
+ DenseMap<unsigned, MachineInstr *> &UniformMap,
+ SmallMapVector<unsigned, MachineInstr *, 4> &VToSMap, LiveIntervals *LIS,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII) {
+ unsigned VLimit = status.TargetVLimit;
+ unsigned SLimit = status.TargetSLimit;
+ auto& ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+
+ GCNDownwardRPTracker Tracker(*LIS);
+
+ bool bUpdated = false;
+ const auto inputLive = status.MBBInputLiveMap[&MBB];
+ Tracker.reset(*MBB.begin(), &inputLive);
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugInstr()) {
+ continue;
+ }
+
+ unsigned VPressure = Tracker.getPressure().getVGPRNum(ST.hasGFX90AInsts());
+ unsigned SPressure = Tracker.getPressure().getMaxSGPR();
+
+ SPressure += RegForVCC;
+
+ Tracker.advance();
+ // Sgpr bound, vtos cannot help.
+ if (SPressure > SLimit)
+ return false;
+
+ if (VPressure <= VLimit) {
+ continue;
+ }
+
+ // Try to make all possible vtos to reduce vpressure.
+ int VExtra = VPressure - VLimit;
+
+ const GCNRPTracker::LiveRegSet &CurLives = Tracker.getLiveRegs();
+ for (auto it : CurLives) {
+ unsigned Reg = it.first;
+ auto UniformIt = UniformMap.find(Reg);
+ if (UniformIt == UniformMap.end())
+ continue;
+ VToSMap[UniformIt->first] = UniformIt->second;
+ VExtra--;
+ bUpdated = true;
+ }
+ }
+ return bUpdated;
+}
+
+// Return true if the user is outside of the def's loop.
+static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User,
+                           MachineLoopInfo *MLI) {
+  MachineLoop *L = MLI->getLoopFor(Def->getParent());
+  return L && !L->contains(User->getParent());
+}
+
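+// For uniform vgprs that are live across a hot spot, insert a
+// V_READFIRSTLANE_B32 right after the def and rewrite safe, non-cross-loop
+// users to the new sgpr, legalizing their operands where needed.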
+bool rematUniformVgprToSgpr(
+ Remat *Remat,
+ MachineFunction &MF, RematStatus &status,
+ DenseMap<MachineBasicBlock *, GCNRegPressure> &MBBPressureMap,
+ std::vector<HotBlock> &hotBlocks, LiveIntervals *LIS, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineLoopInfo *MLI) {
+ DenseMap<unsigned, MachineInstr *> UniformVgprMap =
+ collectUniformVgprs(Remat, MF, MRI, SIRI);
+
+ SmallMapVector<unsigned, MachineInstr *, 4> VToSMap;
+
+ for (auto &hotBlock : hotBlocks) {
+ MachineBasicBlock &MBB = *hotBlock.MBB;
+ collectVToSCrossHotSpot(MBB, status, UniformVgprMap, VToSMap, LIS, MRI,
+ SIRI, SIII);
+ }
+
+ if (VToSMap.empty())
+ return false;
+ SlotIndexes *slotIndexes = LIS->getSlotIndexes();
+ const MCInstrDesc &ReadFirstLaneDesc = SIII->get(AMDGPU::V_READFIRSTLANE_B32);
+ for (auto it : VToSMap) {
+ unsigned Reg = it.first;
+ MachineInstr *MI = it.second;
+
+ auto *VRC = SIRI->getRegClassForReg(MRI, Reg);
+ // TODO: support bigger vgpr to sgpr.
+ if (VRC != &AMDGPU::VGPR_32RegClass)
+ continue;
+ auto *NewRC = SIRI->getEquivalentSGPRClass(VRC);
+ unsigned newDst = MRI.createVirtualRegister(NewRC);
+
+ auto ReadFirstLane =
+ BuildMI(MF, MI->getDebugLoc(), ReadFirstLaneDesc, newDst);
+ SmallVector<MachineInstr *, 2> userMIs;
+ for (MachineInstr &userMI : MRI.use_nodbg_instructions(Reg)) {
+ // Do not replace v->s across loops. Even if the value is uniform
+ // branch divergence can cause a uniform value in a loop to be
+ // non-uniform when used outside a loop.
+ if (IsSafeRematCandidateUser(&userMI, SIII) && !IsCrossLoopUse(MI, &userMI, MLI))
+ userMIs.emplace_back(&userMI);
+ }
+
+ // Finish readfirstlane
+ ReadFirstLane.addReg(Reg);
+ MachineInstr *VToSMI = ReadFirstLane.getInstr();
+ Remat->TotalUniformInsts.insert(VToSMI);
+ Remat->SafeToRemoveInsts.insert(VToSMI);
+ MachineBasicBlock *MBB = MI->getParent();
+ MBB->insertAfter(MI->getIterator(), VToSMI);
+ slotIndexes->insertMachineInstrInMaps(*VToSMI);
+
+ for (MachineInstr *userMI : userMIs) {
+ const auto &Desc = userMI->getDesc();
+ bool bIllegal = false;
+    for (unsigned i = 0; i < userMI->getNumOperands(); i++) {
+ MachineOperand &MO = userMI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+ if (MO.getReg() != Reg)
+ continue;
+ if (i >= Desc.getNumOperands()) {
+ bIllegal = true;
+ break;
+ }
+
+ MO.setReg(newDst);
+ if (userMI->getDesc().operands()[i].RegClass != -1) {
+ if (!SIII->isOperandLegal(*userMI, i, &MO)) {
+ SIII->legalizeOperands(*userMI);
+ // In case legalizeOperands not help, just legalize with mov.
+ if (userMI->getDesc().operands()[i].RegClass != -1 &&
+ !SIII->isOperandLegal(*userMI, i)) {
+ SIII->legalizeOpWithMove(*userMI, i);
+ }
+ }
+ } else {
+ // consider not have limit on reg class.
+ }
+ }
+ if (bIllegal)
+ continue;
+
+ auto rit = userMI->getReverseIterator();
+ rit++;
+ auto endIt = userMI->getParent()->rend();
+ while (rit != endIt && !rit->isDebugInstr() && !slotIndexes->hasIndex(*rit))
+ slotIndexes->insertMachineInstrInMaps(*(rit++));
+ }
+ }
+
+ return true;
+}
+
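+// Check whether MI can be rematerialized inside the hot range from regs that
+// are already live there (or from previously collected remat regs); record its
+// def regs and remat level on success.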
+bool collectRematableHotReg(
+ MachineInstr &MI, const GCNRPTracker::LiveRegSet &hotLive,
+ GCNRPTracker::LiveRegSet &pureHotRematSet,
+ DenseMap<MachineInstr *, unsigned> &pureHotRematLevels, unsigned &DefReg,
+ const GCNRPTracker::LiveRegSet &inputLive,
+ const GCNRPTracker::LiveRegSet &outputLive, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) {
+  // Ignore instructions with no def or with more than one def.
+ if (MI.getDesc().getNumDefs() != 1)
+ return false;
+
+ DefReg = MI.defs().begin()->getReg();
+
+ unsigned level = 0;
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+
+ Register Reg = MO.getReg();
+
+ // If user is in same MI like
+ // %4:vgpr_32 = V_MAD_LEGACY_F32 %2:vgpr_32, %3:vgpr_32, %4:vgpr_32
+ // remat it will not help.
+ if (Reg == DefReg) {
+ return false;
+ }
+
+ if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO))
+ continue;
+ if (Reg.isPhysical())
+ return false;
+
+ if (nullptr ==
+ getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI))
+ return false;
+
+ LaneBitmask mask = llvm::getRegMask(MO, MRI);
+
+ if (isInLiveSet(Reg, mask, hotLive))
+ continue;
+
+ if (isInLiveSet(Reg, mask, pureHotRematSet)) {
+ unsigned regLevel = getPacifistLevel(Reg, pureHotRematLevels, MRI);
+ level = std::max(level, regLevel);
+ continue;
+ }
+
+ return false;
+ }
+
+ for (MachineOperand &MO : MI.defs()) {
+ Register Reg = MO.getReg();
+
+ if (Reg.isPhysical())
+ return false;
+
+ if (nullptr ==
+ getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI))
+ return false;
+
+ LaneBitmask mask = llvm::getRegMask(MO, MRI);
+ pureHotRematSet[Reg] |= mask;
+ }
+
+ pureHotRematLevels[&MI] = level + 1;
+  return true;
+}
+
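+// Try to collect subExps that can be cloned right before hotMI to remat hot
+// live regs from other hot live regs. Returns true when enough vgpr/sgpr
+// saving (vDistance/sDistance) is found, and appends the chosen subExps to
+// inBlockCloneSubExps.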
+bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI,
+ std::vector<SubExp> &inBlockCloneSubExps, bool bVGPR,
+ const GCNRPTracker::LiveRegSet &inputLive,
+ const GCNRPTracker::LiveRegSet &outputLive,
+ DenseSet<MachineInstr *> &hotSet, int vDistance, int sDistance,
+ unsigned VLimit, unsigned SLimit,
+ const DenseSet<MachineBasicBlock *> &MemWriteMBBSet,
+ LiveIntervals *LIS,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII) {
+ auto &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ const auto &SI = LIS->getInstructionIndex(*hotMI).getBaseIndex();
+ const auto LISLR = llvm::getLiveRegs(SI, *LIS, MRI);
+
+ GCNRPTracker::LiveRegSet hotLive = LISLR;
+
+ GCNRPTracker::LiveRegSet pureHotRematSet;
+ std::vector<MachineInstr *> pureHotRematList;
+ DenseMap<MachineInstr *, unsigned> pureHotRematLevels;
+
+ GCNRPTracker::LiveRegSet outputSet;
+ LLVM_DEBUG(dbgs() << "pure hot remat begin\n");
+ // Find reg which could remat from other reg in liveSet.
+ const unsigned kMaxRematLevel = 6;
+ GCNDownwardRPTracker Tracker(*LIS);
+ Tracker.reset(*MBB.begin(), &inputLive);
+ for (auto it = MBB.begin(); it != MBB.end(); it++) {
+ MachineInstr &MI = *it;
+ const GCNRegPressure &RP = Tracker.getPressure();
+
+ if (MI.isDebugInstr())
+ continue;
+
+    // Ignore insts in the hot range.
+ if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit || RP.getMaxSGPR() > SLimit) {
+ Tracker.advance();
+ continue;
+ }
+
+ // Stop at hotMI.
+ if (&MI == hotMI)
+ break;
+
+ Tracker.advance();
+
+ unsigned DefReg = 0;
+ if (collectRematableHotReg(MI, hotLive, pureHotRematSet, pureHotRematLevels,
+ DefReg, inputLive, outputLive, MRI, SIRI)) {
+ unsigned level = pureHotRematLevels[&MI];
+ if (level >= kMaxRematLevel)
+ continue;
+
+ // If the def reg is in hot reg.
+ // Add to output.
+ if (hotLive.find(DefReg) != hotLive.end()) {
+ bool bUserIsHot = false;
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(DefReg)) {
+ if (UseMI.getParent() != &MBB)
+ continue;
+ if (0 == hotSet.count(&UseMI))
+ continue;
+
+ const auto &useSI = LIS->getInstructionIndex(UseMI).getBaseIndex();
+ // When has a hot user after hotMI, remat it may not help.
+ if (useSI > SI) {
+ bUserIsHot = true;
+ break;
+ }
+ }
+
+ if (bUserIsHot)
+ continue;
+ outputSet[DefReg];
+ LLVM_DEBUG(dbgs() << "hotRemat:");
+ LLVM_DEBUG(MI.getOperand(0).dump());
+ // remove it from hotLive to avoid it as input when build dag.
+ hotLive.erase(DefReg);
+ }
+ pureHotRematList.emplace_back(&MI);
+ LLVM_DEBUG(dbgs() << "level:" << level);
+ LLVM_DEBUG(MI.dump());
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "pure hot remat end\n");
+
+  // Create input/output for the pure hot remat.
+  // The input is the hot regs at level 1 and the output is the regs at
+  // level > 1.
+  // Build SubExps with pureHotRematList as nodes, hotLive as input, and
+  // outputSet as output.
+  // Do not join inputs when building the ExpDag, to get small subExps.
+ ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ false);
+ dag.build(hotLive, outputSet, pureHotRematList);
+ // Find best subExp add to inBlockCloneSubExps.
+ // Sort by size of subExp.
+ std::sort(dag.SubExps.begin(), dag.SubExps.end(),
+ [](const SubExp &a, const SubExp &b) {
+ return a.SUnits.size() < b.SUnits.size();
+ });
+ std::vector<SubExp> cloneSubExps;
+ int distance = bVGPR ? vDistance : sDistance;
+ for (SubExp &subExp : dag.SubExps) {
+ if (subExp.bNotSafeToCopy)
+ continue;
+ if (bVGPR) {
+ if (subExp.vOutputSize == 0)
+ continue;
+ } else {
+ if (subExp.sOutputSize == 0)
+ continue;
+ }
+ if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false))
+ continue;
+    // Do not clone big subExps.
+ if (subExp.SUnits.size() > 10)
+ continue;
+ // Do not allow remat in the block when the expression has a memory op and
+ // the block has a write. We could allow this in some cases with better
+ // analysis.
+ if (subExp.bHasMemInst && MemWriteMBBSet.count(&MBB))
+ continue;
+ if (bVGPR) {
+ distance -= subExp.vOutputSize;
+ } else {
+ distance -= subExp.sOutputSize;
+ }
+ cloneSubExps.emplace_back(subExp);
+ if (distance <= 0)
+ break;
+ }
+ if (distance <= 0) {
+ inBlockCloneSubExps.insert(inBlockCloneSubExps.end(), cloneSubExps.begin(),
+ cloneSubExps.end());
+ }
+ return distance <= 0;
+}
+
+// Try to remat live regs in a hot spot from other live regs in the hot spot.
+bool tryRematInHotSpot(
+ MachineBasicBlock &MBB, RematStatus &status, int vDistance, int sDistance,
+ int vSaved, int sSaved, std::vector<SubExp> &inBlockCloneSubExps,
+ DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotVInstMap,
+ DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotSInstMap,
+ LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ unsigned VLimit = status.TargetVLimit;
+ unsigned SLimit = status.TargetSLimit;
+
+ auto& ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB];
+
+ const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB];
+
+ // Collect reg pressure.
+ unsigned maxLocalVPressure = 0;
+ unsigned maxLocalSPressure = 0;
+ // Build a DAG or only on demand?
+ MachineInstr *hotVMI = nullptr;
+ MachineInstr *hotSMI = nullptr;
+ DenseSet<MachineInstr *> hotSet;
+
+ GCNDownwardRPTracker Tracker(*LIS);
+
+ Tracker.reset(*MBB.begin(), &inputLive);
+ for (auto it = MBB.begin(); it != MBB.end(); it++) {
+ MachineInstr &MI = *it;
+ if (MI.isDebugInstr()) {
+ continue;
+ }
+
+ unsigned VPressure = Tracker.getPressure().getVGPRNum(ST.hasGFX90AInsts());
+ unsigned SPressure = Tracker.getPressure().getMaxSGPR();
+
+ SPressure += RegForVCC;
+
+ VPressure -= vSaved;
+ SPressure -= sSaved;
+ Tracker.advance();
+
+ if (VPressure <= VLimit && SPressure <= SLimit) {
+ continue;
+ }
+ hotSet.insert(&MI);
+ if (maxLocalVPressure < VPressure) {
+ maxLocalVPressure = VPressure;
+ hotVMI = &MI;
+ }
+ if (maxLocalSPressure < SPressure) {
+ maxLocalSPressure = SPressure;
+ hotSMI = &MI;
+ }
+ }
+
+ inBlockHotVInstMap[&MBB] = hotVMI;
+ inBlockHotSInstMap[&MBB] = hotSMI;
+ if (vDistance > 0 && hotVMI) {
+    // Use hotVMI when applying.
+ inBlockHotSInstMap[&MBB] = nullptr;
+ if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*bVGPR*/ true, inputLive,
+ outputLive, hotSet, vDistance, sDistance, VLimit, SLimit,
+ status.MemWriteMBBSet,
+ LIS, MRI, SIRI, SIII))
+ return true;
+ }
+
+ if (sDistance > 0 && hotSMI) {
+    // Use hotSMI when applying.
+ inBlockHotSInstMap[&MBB] = hotSMI;
+ inBlockHotVInstMap[&MBB] = nullptr;
+ return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*bVGPR*/ false,
+ inputLive, outputLive, hotSet, vDistance, sDistance, VLimit,
+ SLimit, status.MemWriteMBBSet,
+ LIS, MRI, SIRI, SIII);
+ }
+ return false;
+}
+// Sort subExpCandidates so that deeper subExps are applied first.
+// If subExp0 uses the result of subExp1, subExp0 is deeper than subExp1.
+// If subExp1 were applied before subExp0, the new clone of subExp0 that uses
+// the result of subExp1 would still reference the old reg of subExp1, and the
+// reg pressure would not be reduced.
+void sortSubExpCandidates(std::vector<SubExp> &subExpCandidates) {
+ MapVector<unsigned, SetVector<SubExp *>> inputMap;
+ MapVector<unsigned, SetVector<SubExp *>> outputMap;
+ struct SortNode {
+ SubExp Exp;
+ unsigned Depth;
+ bool bDepthDirty;
+ SmallDenseSet<SubExp *, 2> Preds;
+ SmallDenseSet<SubExp *, 2> Succs;
+ };
+
+ {
+ SmallVector<unsigned, 10> RegSortStorage;
+ for (SubExp &Exp : subExpCandidates) {
+ RegSortStorage.assign(Exp.TopRegs.begin(), Exp.TopRegs.end());
+ std::sort(RegSortStorage.begin(), RegSortStorage.end());
+ for (auto it : RegSortStorage) {
+ unsigned Reg = it;
+ inputMap[Reg].insert(&Exp);
+ }
+
+ RegSortStorage.assign(Exp.BottomRegs.begin(), Exp.BottomRegs.end());
+ std::sort(RegSortStorage.begin(), RegSortStorage.end());
+ for (auto it : RegSortStorage) {
+ unsigned Reg = it;
+ outputMap[Reg].insert(&Exp);
+ }
+ }
+ }
+
+ MapVector<SubExp *, SortNode> sortMap;
+ for (auto it : inputMap) {
+ unsigned Reg = it.first;
+ auto outIt = outputMap.find(Reg);
+ if (outIt == outputMap.end())
+ continue;
+ auto &inExps = it.second;
+ auto &outExps = outIt->second;
+ for (SubExp *inExp : inExps) {
+ for (SubExp *outExp : outExps) {
+ if (inExp->bHoist != outExp->bHoist) {
+          // Different directions.
+          // If the output (def) moves up and the input (use) moves down,
+          // nothing conflicts.
+          if (outExp->bHoist)
+            continue;
+          // The input (use) cannot move up while the output (def) moves down.
+          // Keep the exp which saves more.
+ int inExpGain = inExp->vOutputSize - inExp->vInputSize;
+ int outExpGain = outExp->vInputSize - inExp->vOutputSize;
+ if (inExpGain >= outExpGain) {
+ outExp->SUnits.clear();
+ } else {
+ inExp->SUnits.clear();
+ }
+ continue;
+ }
+ // Link outExp to inExp.
+ if (inExp->bHoist) {
+ sortMap[outExp].Preds.insert(inExp);
+ sortMap[inExp].Succs.insert(outExp);
+ } else {
+ sortMap[inExp].Preds.insert(outExp);
+ sortMap[outExp].Succs.insert(inExp);
+ }
+ }
+ }
+ }
+
+ if (sortMap.empty())
+ return;
+
+ SmallVector<SubExp *, 8> WorkList;
+ for (SubExp &Exp : subExpCandidates) {
+ SortNode &Node = sortMap[&Exp];
+ Node.Depth = 0;
+ Node.Exp = Exp;
+ Node.bDepthDirty = !Node.Preds.empty();
+ if (!Node.bDepthDirty)
+ WorkList.emplace_back(&Exp);
+ }
+ // Calc depth.
+ while (!WorkList.empty()) {
+ SubExp *Exp = WorkList.pop_back_val();
+ SortNode &Node = sortMap[Exp];
+ for (SubExp *Succ : Node.Succs) {
+ SortNode &SuccNode = sortMap[Succ];
+ SuccNode.Depth = std::max(SuccNode.Depth, Node.Depth + 1);
+ bool bAllPrevClean = true;
+ for (SubExp *Prev : SuccNode.Preds) {
+ SortNode &PrevNode = sortMap[Prev];
+ if (PrevNode.bDepthDirty) {
+ bAllPrevClean = false;
+ break;
+ }
+ }
+ if (bAllPrevClean) {
+ SuccNode.bDepthDirty = false;
+ WorkList.push_back(Succ);
+ }
+ }
+ }
+
+ std::vector<SortNode *> nodes;
+ for (auto &it : sortMap) {
+ SortNode &node = it.second;
+ nodes.emplace_back(&node);
+ }
+
+ struct sorter {
+ bool operator()(const SortNode *a, const SortNode *b) {
+ return a->Depth > b->Depth;
+ }
+ };
+
+  // Deeper subExps should be applied first.
+ std::sort(nodes.begin(), nodes.end(), sorter());
+
+ subExpCandidates.clear();
+ for (auto &node : nodes) {
+ subExpCandidates.emplace_back(node->Exp);
+ }
+}
+
+// Compare pressure; return true if the maxV0/maxS0 pressure is higher than
+// maxV1/maxS1.
+bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1,
+ unsigned maxS1, const GCNSubtarget *ST) {
+ unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(maxV0);
+ unsigned VTgtOcc1 = ST->getOccupancyWithNumVGPRs(maxV1);
+ unsigned STgtOcc0 = ST->getOccupancyWithNumSGPRs(maxS0);
+ unsigned STgtOcc1 = ST->getOccupancyWithNumSGPRs(maxS1);
+ unsigned Occ0 = std::min(VTgtOcc0, STgtOcc0);
+ unsigned Occ1 = std::min(VTgtOcc1, STgtOcc1);
+  // Higher occupancy means lower pressure.
+ if (Occ0 > Occ1)
+ return false;
+ if (Occ0 < Occ1)
+ return true;
+  // When sgpr bound, a bigger sgpr count means higher pressure.
+ if (VTgtOcc0 > STgtOcc0 && VTgtOcc1 > STgtOcc1) {
+ return maxS0 > maxS1;
+ }
+  // When vgpr bound or mixed, a bigger vgpr count means higher pressure.
+ return maxV0 > maxV1;
+}
+
+// Return true if the subExp can help pressure for passThrus.
+bool canHelpPressureWhenSink(SubExp &subExp,
+                             const GCNRPTracker::LiveRegSet &passThrus,
+                             const MachineRegisterInfo &MRI,
+                             const SIRegisterInfo *SIRI,
+                             const SIInstrInfo *SIII,
+                             const MachineLoopInfo *MLI,
+                             MachineDominatorTree *pDT, bool bCanClone,
+                             bool bSgprBound) {
+ LLVM_DEBUG(subExp.dump(MRI, SIRI));
+ if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false))
+ return false;
+
+  // Update the input size to ignore live regs that are already in passThrus.
+ for (auto it : subExp.inputLive) {
+ unsigned Reg = it.first;
+ if (passThrus.count(Reg) == 0)
+ continue;
+ unsigned Size = getRegSize(Reg, it.second, MRI, SIRI);
+ if (SIRI->isVGPR(MRI, Reg)) {
+ subExp.vInputSize -= Size;
+ } else {
+ subExp.sInputSize -= Size;
+ }
+ }
+
+ if (subExp.vInputSize > subExp.vOutputSize)
+ return false;
+
+ if (subExp.sInputSize > subExp.sOutputSize && bSgprBound)
+ return false;
+
+ if (subExp.sInputSize >= subExp.sOutputSize &&
+ subExp.vInputSize == subExp.vOutputSize)
+ return false;
+
+  // Try to find an insert block.
+  // Skip sub exps with multi-def outputs.
+  // Collect the user blocks and find their common dominator.
+ BlockSet userBlocks;
+ for (unsigned Reg : subExp.BottomRegs) {
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ MachineBasicBlock *UserBB = UseMI.getParent();
+ // Skip current BB.
+ if (UserBB != subExp.FromBB)
+ userBlocks.insert(UserBB);
+ }
+ }
+ if (userBlocks.empty())
+ return false;
+ MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks);
+ if (!pDT->dominates(subExp.FromBB, userBlock)) {
+ return false;
+ }
+ if (userBlock == subExp.FromBB &&
+      // When cloning is allowed, the clone path can still be taken even if the
+      // subExp cannot be moved.
+ !bCanClone)
+ return false;
+
+ subExp.ToBB = userBlock;
+ if (auto *toLoop = MLI->getLoopFor(userBlock)) {
+ auto *fromLoop = MLI->getLoopFor(subExp.FromBB);
+ if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth())
+ subExp.bMoveIntoLoop = true;
+ } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) {
+ auto *toLoop = MLI->getLoopFor(userBlock);
+    // Not safe to move out of the loop.
+ if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() ||
+ toLoop != fromLoop)
+ return false;
+ }
+ return true;
+}
+
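+// Return true if hoisting subExp up to the single block that defines its
+// inputs can help pressure: it must be safe to move up, must not grow the
+// vgpr (and, when sgpr bound, sgpr) usage, and must not move into a deeper
+// loop or out of its loop.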
+bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII,
+ const MachineLoopInfo *MLI, bool bSgprBound) {
+ if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ true))
+ return false;
+ if (subExp.vInputSize < subExp.vOutputSize)
+ return false;
+ if (subExp.sInputSize < subExp.sOutputSize && bSgprBound)
+ return false;
+
+ if (subExp.sInputSize <= subExp.sOutputSize &&
+ subExp.vInputSize == subExp.vOutputSize)
+ return false;
+
+  // Try to find an insert block.
+  // Skip sub exps with multi-def outputs.
+  // Collect the def blocks and require a single def block.
+ BlockSet defBlocks;
+ for (unsigned Reg : subExp.TopRegs) {
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+ if (!DefMI)
+ continue;
+ defBlocks.insert(DefMI->getParent());
+ }
+ if (defBlocks.size() != 1)
+ return false;
+ MachineBasicBlock *defBlock = *defBlocks.begin();
+ subExp.ToBB = defBlock;
+  // Do not hoist within the same block.
+ if (subExp.ToBB == subExp.FromBB)
+ return false;
+
+ if (auto *toLoop = MLI->getLoopFor(defBlock)) {
+ auto *fromLoop = MLI->getLoopFor(subExp.FromBB);
+ // TODO: enable move into loop when hoist.
+ if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth())
+ return false;
+ } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) {
+ auto *toLoop = MLI->getLoopFor(defBlock);
+    // Not safe to move out of the loop.
+ if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() ||
+ toLoop != fromLoop)
+ return false;
+ }
+ return true;
+}
+
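+// Group the safe pass-through regs by their defining block and return the
+// groups sorted by block number.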
+SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
+groupPassThruByDefBlock(Remat *Remat,
+ const GCNRPTracker::LiveRegSet &passThrus,
+ GCNRPTracker::LiveRegSet &usedPassThrus,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII) {
+ MapVector<MachineBasicBlock *, GCNRPTracker::LiveRegSet> Candidates;
+
+  // Group safe candidates by defining block.
+ for (auto it : passThrus) {
+ unsigned Reg = it.first;
+    // Skip already-used pass-through regs to avoid counting them twice for
+    // different hot blocks.
+ if (usedPassThrus.count(Reg))
+ continue;
+ LLVM_DEBUG(print_vreg(Reg, MRI));
+ LLVM_DEBUG(if (SIRI->isSGPRReg(MRI, Reg)) dbgs() << " sgpr ";
+ else dbgs() << " vgpr ";);
+ if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true)) {
+ LLVM_DEBUG(dbgs() << " is not safe\n");
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << " is safe\n");
+ // DefMI is already checked in isSafeCandidate.
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+
+ GCNRPTracker::LiveRegSet &DefInMBB = Candidates[DefMI->getParent()];
+ DefInMBB[Reg] = it.second;
+ }
+
+ llvm::SmallVector<std::pair<MachineBasicBlock*, GCNRPTracker::LiveRegSet>> result = Candidates.takeVector();
+
+ LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto it
+ : result) {
+ MachineBasicBlock *MBB = it.first;
+ auto &defInMBB = it.second;
+ MBB->dump();
+ llvm::dumpLiveSet(defInMBB, SIRI);
+ } llvm::dbgs() << "end of candidates\n";);
+
+ std::sort(result.begin(), result.end(),
+ [](std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet> &it0,
+ std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet> &it1) {
+ return it0.first->getNumber() < it1.first->getNumber();
+ });
+
+ LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it
+ : result) {
+ MachineBasicBlock *MBB = it.first;
+ auto &defInMBB = it.second;
+ MBB->dump();
+ llvm::dumpLiveSet(defInMBB, SIRI);
+ } llvm::dbgs() << "end of candidates\n";);
+
+ return result;
+}
+
+// Collect the pass-through regs of MBB.
+GCNRPTracker::LiveRegSet
+collectPassThrus(MachineBasicBlock *MBB,
+ const GCNRPTracker::LiveRegSet &inputLive,
+ const GCNRPTracker::LiveRegSet &outputLive,
+ const GCNRPTracker::LiveRegSet &usedPassThrus,
+ const GCNRPTracker::LiveRegSet &liveRegCandidates,
+ MachineRegisterInfo &MRI, bool bCanClone) {
+ GCNRPTracker::LiveRegSet passThrus;
+ llvm::mergeLiveRegSet(passThrus, inputLive);
+ llvm::andLiveRegSet(passThrus, outputLive);
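+  // passThrus now holds the intersection of the block's live-in and live-out
+  // sets, i.e. the values which are live across MBB.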
+
+  // Remove regs which are not in liveRegCandidates.
+ GCNRPTracker::LiveRegSet tmpPassThrus = passThrus;
+ for (auto it : tmpPassThrus) {
+ unsigned Reg = it.first;
+ if (!liveRegCandidates.count(Reg)) {
+ passThrus.erase(Reg);
+ }
+ }
+ tmpPassThrus = passThrus;
+  // Remove regs which are read or written in MBB.
+ for (auto it : tmpPassThrus) {
+ unsigned Reg = it.first;
+ DenseSet<MachineBasicBlock *> DefMBBs;
+ for (MachineInstr &DefMI : MRI.def_instructions(Reg)) {
+ MachineBasicBlock *MBB = DefMI.getParent();
+ DefMBBs.insert(MBB);
+ }
+ DenseSet<MachineBasicBlock *> UseMBBs;
+    // Allow uses of a pass-thru reg inside MBB when cloning is allowed.
+ if (!bCanClone) {
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ MachineBasicBlock *UserMBB = UseMI.getParent();
+ UseMBBs.insert(UserMBB);
+ }
+ }
+ bool bW = DefMBBs.count(MBB) > 0;
+ bool bR = UseMBBs.count(MBB) > 0;
+
+ bool bPassThru = !bW && !bR;
+ if (!bPassThru)
+ passThrus.erase(Reg);
+ }
+ return passThrus;
+}
+// Try to build a free subExp whose inputs are all pass-thrus.
+SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, GCNRPTracker::LiveRegSet &passThrus,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) {
+ SubExp freeExp;
+  // Try to split the subExp to find a case that helps.
+  // Scan all insts in the subExp and propagate free insts whose inputs all
+  // come from pass-thrus.
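+  // Propagation example (hypothetical regs): if %a and %b are pass-thru
+  // inputs and the exp contains "%c = add %a, %b" followed by
+  // "%d = mul %c, %x" where %x is not free, only the add becomes a free inst
+  // and %c a free reg.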
+ SmallDenseSet<unsigned, 4> freeRegs;
+ SmallDenseSet<unsigned, 8> freeInstUseRegs;
+ SmallVector<MachineInstr *, 4> freeInsts;
+ for (MachineInstr *MI : subExp.SUnits) {
+ bool bIsFree = true;
+ // Check all use regs are free.
+ for (MachineOperand &MO : MI->uses()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (MO.isImplicit() && Reg == AMDGPU::EXEC)
+ continue;
+ if (MRI.getUniqueVRegDef(Reg) == nullptr) {
+ bIsFree = false;
+ break;
+ }
+      // An input of the subExp which is a pass-thru is free.
+ if (passThrus.count(Reg) && subExp.TopRegs.count(Reg))
+ continue;
+ if (freeRegs.count(Reg))
+ continue;
+ bIsFree = false;
+ break;
+ }
+ // Check def is unique.
+ for (MachineOperand &MO : MI->defs()) {
+ unsigned Reg = MO.getReg();
+ if (MRI.getUniqueVRegDef(Reg) == nullptr) {
+ bIsFree = false;
+ break;
+ }
+ }
+ if (!bIsFree)
+ continue;
+ // Save inst as free inst.
+ freeInsts.emplace_back(MI);
+ // Save def as free reg.
+ for (MachineOperand &MO : MI->defs()) {
+ unsigned Reg = MO.getReg();
+ freeRegs.insert(Reg);
+ }
+ // Save use regs as free use reg.
+ for (MachineOperand &MO : MI->uses()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+
+ freeInstUseRegs.insert(Reg);
+ }
+ }
+  // Then keep only the free insts whose defs are used by another free inst or
+  // are part of the subExp output.
+ for (MachineInstr *MI : freeInsts) {
+ bool bIsFreeUsed = false;
+ for (MachineOperand &MO : MI->defs()) {
+ unsigned Reg = MO.getReg();
+ // Used as freeInst or output.
+ bIsFreeUsed |=
+ freeInstUseRegs.count(Reg) > 0 || subExp.BottomRegs.count(Reg);
+ }
+ if (!bIsFreeUsed)
+ continue;
+ freeExp.SUnits.emplace_back(MI);
+ }
+ if (freeExp.SUnits.empty()) {
+    // Mark as having a terminator so the empty exp is treated as unsafe.
+ freeExp.bHasTerminatorInst = true;
+ return freeExp;
+ }
+ // Build BottomRegs and TopRegs for freeExp.
+  // BottomRegs are the freeRegs which are also in subExp.BottomRegs.
+ for (unsigned freeReg : freeRegs) {
+ if (subExp.BottomRegs.count(freeReg))
+ freeExp.BottomRegs.insert(freeReg);
+ }
+  // TopRegs are the freeInstUseRegs which are also in subExp.TopRegs.
+ for (unsigned freeInstUseReg : freeInstUseRegs) {
+ if (subExp.TopRegs.count(freeInstUseReg))
+ freeExp.TopRegs.insert(freeInstUseReg);
+ }
+ freeExp.FromBB = subExp.FromBB;
+ freeExp.ToBB = subExp.ToBB;
+  // Must be cloned since it is only a part of the subExp.
+ freeExp.bCloneOnly = true;
+
+  // Calculate reg pressure for freeExp.
+ for (unsigned Reg : freeExp.TopRegs) {
+ freeExp.inputLive[Reg];
+ }
+
+ for (unsigned Reg : freeExp.BottomRegs) {
+ freeExp.outputLive[Reg];
+ }
+
+ CollectLiveSetPressure(freeExp.inputLive, MRI, SIRI, freeExp.vInputSize,
+ freeExp.sInputSize);
+ CollectLiveSetPressure(freeExp.outputLive, MRI, SIRI, freeExp.vOutputSize,
+ freeExp.sOutputSize);
+ return freeExp;
+}
+
+std::vector<SubExp> buildSubExpCandidates(
+ Remat *Remat,
+ SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
+ &Candidates,
+ GCNRPTracker::LiveRegSet &passThrus, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ const MachineLoopInfo *MLI, SlotIndexes *slotIndexes,
+ MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound,
+ GCNRPTracker::LiveRegSet &unUsedPassThrus,
+ DenseSet<MachineBasicBlock *> &MemWriteMBBSet,
+ bool bAllowPartialUseInSubExp) {
+ std::vector<SubExp> subExpCandidates;
+ // Build exp dag on define blocks.
+ // Save profit candidates into list.
+ for (auto &it : Candidates) {
+ MachineBasicBlock *DefMBB = it.first;
+    // Try to sink the sub exps in DefMBB which define the out regs.
+ GCNRPTracker::LiveRegSet &DefInMBB = it.second;
+    // Go up the dag until a shared node is reached.
+ auto subExps =
+ buildSubExpFromCandidates(Remat, DefInMBB, DefMBB, SIRI, SIII, MRI,
+ slotIndexes, unUsedPassThrus, bAllowPartialUseInSubExp);
+ for (SubExp &subExp : subExps) {
+ if (subExp.bHasMemInst) {
+        // Skip when a memory ld/st inst would need to cross an MBB which
+        // writes memory.
+        // TODO: check that all MBBs between FromBB and ToBB do not write
+        // memory. Currently just skip when any memory write exists.
+ if (!MemWriteMBBSet.empty()) {
+ MachineBasicBlock *FromBB = subExp.FromBB;
+ MachineBasicBlock *ToBB = subExp.ToBB;
+ if (subExp.bHoist) {
+ FromBB = subExp.ToBB;
+ ToBB = subExp.FromBB;
+ }
+ bool bCrossMemWriteMBB = false;
+ for (MachineBasicBlock *MemMBB : MemWriteMBBSet) {
+ if (pDT->dominates(ToBB, MemMBB))
+ continue;
+ if (pDT->dominates(MemMBB, FromBB))
+ continue;
+ bCrossMemWriteMBB = true;
+ break;
+ }
+ if (bCrossMemWriteMBB)
+ continue;
+ }
+ }
+ if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, pDT,
+ bCanClone, bSgprBound)) {
+ if (bAllowPartialUseInSubExp && subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) {
+ SubExp freeSubExp = buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI);
+ if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, MLI, pDT,
+ bCanClone, bSgprBound)) {
+ subExpCandidates.emplace_back(freeSubExp);
+ }
+ }
+ continue;
+ }
+
+ subExpCandidates.emplace_back(subExp);
+ }
+ }
+ return subExpCandidates;
+}
+
+std::pair<int, int>
+calculateSaving(HotBlock &hotBB, std::vector<SubExp> &subExpCandidates,
+ GCNRPTracker::LiveRegSet &inputLive,
+ GCNRPTracker::LiveRegSet &outputLive, bool bVOutBound,
+ bool bSOutBound, bool bCanClone, MachineDominatorTree *pDT,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) {
+ int vgpr = 0;
+ int sgpr = 0;
+ MachineBasicBlock *MBB = hotBB.MBB;
+ // Sink saving.
+ for (SubExp &Exp : subExpCandidates) {
+ if (Exp.bHoist) {
+      // ToBB -> MBB -> FromBB.
+      // If ToBB does not dominate the hot block, the reg will not be live in
+      // MBB.
+ if (!pDT->dominates(Exp.ToBB, MBB))
+ continue;
+ } else {
+      // If FromBB does not dominate the hot block, the reg will not be live
+      // in MBB.
+ if (!pDT->dominates(Exp.FromBB, MBB))
+ continue;
+ // When subExp is from hotBB, check output instead of input.
+ if (Exp.FromBB == MBB) {
+ if (bVOutBound && Exp.vOutputSize < Exp.vInputSize)
+ continue;
+ if (bSOutBound && Exp.sOutputSize < Exp.sInputSize)
+ continue;
+ vgpr += Exp.vInputSize;
+ vgpr -= Exp.vOutputSize;
+ sgpr += Exp.sInputSize;
+ sgpr -= Exp.sOutputSize;
+ continue;
+ }
+ }
+ int vgprDiff = 0;
+ int sgprDiff = 0;
+ MachineBasicBlock *ToMBB = Exp.ToBB;
+    // If the subExp sinks into the hot block, it crosses the output live set
+    // instead of the input.
+ GCNRPTracker::LiveRegSet &crossLive = MBB == ToMBB ? outputLive : inputLive;
+
+ bool bClone = false;
+ GCNRPTracker::LiveRegSet newInput;
+ if (!Exp.bMoveIntoLoop) {
+ if (Exp.bHoist) {
+        // If FromBB dominates the hot block, hoisting will not change
+        // liveness for MBB.
+ if (Exp.FromBB != MBB && pDT->dominates(Exp.FromBB, MBB))
+ continue;
+ } else {
+        // If ToBB dominates the hot block, sinking will not change liveness
+        // for MBB.
+ if (ToMBB != MBB && pDT->dominates(ToMBB, MBB)) {
+ if (bCanClone && !Exp.bNotSafeToCopy) {
+ bClone = true;
+ } else {
+ continue;
+ }
+ }
+ }
+
+ for (auto outIt : Exp.outputLive) {
+ unsigned Reg = outIt.first;
+ LaneBitmask outMask = outIt.second;
+ LaneBitmask MBBBeginMask;
+ if (crossLive.find(Reg) != crossLive.end())
+ MBBBeginMask = crossLive[Reg];
+        // For a sink, take the lanes live both at the block begin and in the
+        // exp output: sinking kills them. For a hoist, take the lanes not
+        // live at the block begin but live in the exp output: hoisting makes
+        // them live.
+ LaneBitmask profitMask =
+ Exp.bHoist ? (outMask & (~MBBBeginMask)) : (outMask & MBBBeginMask);
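+        // E.g. with outMask = 0b0011 and MBBBeginMask = 0b0001, a sink gets
+        // profitMask = 0b0001 and a hoist gets profitMask = 0b0010.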
+ if (MBBBeginMask.any()) {
+ unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
+ LLVM_DEBUG(std::string movStr =
+ Exp.bHoist ? "output hoist:" : "output sink:";
+ dbgs() << movStr << Register::virtReg2Index(Reg)
+ << " " << Size);
+          // The exp output is live at the block input. It decreases liveness
+          // for MBB when sinking and increases it when hoisting.
+ if (SIRI->isVGPR(MRI, Reg)) {
+ LLVM_DEBUG(dbgs() << "v\n");
+ if (Exp.bHoist)
+ vgprDiff += Size;
+ else
+ vgprDiff -= Size;
+ } else {
+ LLVM_DEBUG(dbgs() << "s\n");
+ if (Exp.bHoist)
+ sgprDiff += Size;
+ else
+ sgprDiff -= Size;
+ }
+ }
+ }
+
+ for (auto inIt : Exp.inputLive) {
+ unsigned Reg = inIt.first;
+ LaneBitmask inMask = inIt.second;
+ LaneBitmask MBBBeginMask;
+ if (crossLive.find(Reg) != crossLive.end())
+ MBBBeginMask = crossLive[Reg];
+        // For a sink, take the lanes not live at the block begin but live in
+        // the exp input: sinking makes them live. For a hoist, take the lanes
+        // live both at the block begin and in the exp input: hoisting kills
+        // them.
+ LaneBitmask profitMask =
+ Exp.bHoist ? (inMask & MBBBeginMask) : (inMask & (~MBBBeginMask));
+ if (profitMask.any()) {
+          // Update the input live set to avoid counting the same input more
+          // than once.
+          newInput[Reg] |= inMask;
+          // The exp input is not live at the block begin; it will increase
+          // liveness for MBB.
+ unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
+
+ LLVM_DEBUG(std::string movStr =
+ Exp.bHoist ? "input hoist:" : "input sink:";
+ dbgs() << movStr << Register::virtReg2Index(Reg)
+ << " " << Size);
+ if (SIRI->isVGPR(MRI, Reg)) {
+ LLVM_DEBUG(dbgs() << "v\n");
+ if (Exp.bHoist)
+ vgprDiff -= Size;
+ else
+ vgprDiff += Size;
+ } else {
+ LLVM_DEBUG(dbgs() << "s\n");
+ if (Exp.bHoist)
+ sgprDiff -= Size;
+ else
+ sgprDiff += Size;
+ }
+ }
+ }
+ } else {
+      // When sinking into a loop, the input will be live in every block
+      // inside the loop. The output will only be live between the sink
+      // blocks and the use blocks. If MBB dominates any user of an output
+      // reg, that reg will still be live in MBB, so it cannot be counted as
+      // profit. Hoisting into a loop is not supported yet.
+ for (auto outIt : Exp.outputLive) {
+ unsigned Reg = outIt.first;
+ bool bDomUser = false;
+ for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) {
+ MachineBasicBlock *UserMBB = MI.getParent();
+ if (pDT->dominates(MBB, UserMBB)) {
+ bDomUser = true;
+ break;
+ }
+ }
+ if (bDomUser)
+ continue;
+
+ LaneBitmask outMask = outIt.second;
+ LaneBitmask MBBBeginMask;
+ if (inputLive.find(Reg) != inputLive.end())
+ MBBBeginMask = inputLive[Reg];
+ LaneBitmask profitMask = outMask & MBBBeginMask;
+ if (MBBBeginMask.any()) {
+ unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
+ LLVM_DEBUG(dbgs() << "move:" << Register::virtReg2Index(Reg)
+ << " " << Size);
+          // The exp output is live at the block input.
+          // It will decrease liveness for MBB.
+ if (SIRI->isVGPR(MRI, Reg)) {
+ LLVM_DEBUG(dbgs() << "v\n");
+ vgprDiff -= Size;
+ } else {
+ LLVM_DEBUG(dbgs() << "s\n");
+ sgprDiff -= Size;
+ }
+ }
+ }
+
+ for (auto inIt : Exp.inputLive) {
+ unsigned Reg = inIt.first;
+ LaneBitmask inMask = inIt.second;
+ LaneBitmask MBBBeginMask;
+ if (inputLive.find(Reg) != inputLive.end())
+ MBBBeginMask = inputLive[Reg];
+      // Take the lanes not live at the block begin but live in the exp input.
+ LaneBitmask profitMask = inMask & (~MBBBeginMask);
+ if (profitMask.any()) {
+        // Update the input live set to avoid counting the same input more
+        // than once.
+        newInput[Reg] |= inMask;
+        // The exp input is not live at the block begin; it will increase
+        // liveness for MBB.
+ unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
+
+ LLVM_DEBUG(dbgs() << "add:" << Register::virtReg2Index(Reg)
+ << " " << Size);
+ if (SIRI->isVGPR(MRI, Reg)) {
+ LLVM_DEBUG(dbgs() << "v\n");
+ vgprDiff += Size;
+ } else {
+ LLVM_DEBUG(dbgs() << "s\n");
+ sgprDiff += Size;
+ }
+ }
+ }
+ }
+
+ if (bVOutBound && vgprDiff > 0)
+ continue;
+
+ if (bSOutBound && sgprDiff > 0)
+ continue;
+ llvm::mergeLiveRegSet(crossLive, newInput);
+ vgpr += vgprDiff;
+ sgpr += sgprDiff;
+ if (bClone)
+ Exp.bCloneOnly = true;
+ }
+
+ return std::make_pair(vgpr, sgpr);
+}
+
+void addExpCandidates(std::vector<SubExp> &subExpCandidates,
+ std::vector<SubExp> &subExps,
+ GCNRPTracker::LiveRegSet &usedRegs) {
+ subExpCandidates.insert(subExpCandidates.end(), subExps.begin(),
+ subExps.end());
+ for (auto &Exp : subExps) {
+ if (Exp.bHoist) {
+ for (auto &Reg : Exp.TopRegs) {
+ usedRegs[Reg];
+ }
+ } else {
+ for (auto &Reg : Exp.BottomRegs) {
+ usedRegs[Reg];
+ }
+ }
+ }
+}
+
+bool tryToAddSubExps(
+ Remat *Remat,
+ HotBlock &hotBB, RematStatus &status, std::vector<SubExp> &subExpCandidates,
+ std::vector<SubExp> &inBlockCloneSubExps,
+ DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotVInstMap,
+ DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotSInstMap,
+ SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
+ Candidates,
+ int vgpr, int sgpr, const GCNRPTracker::LiveRegSet &savingInputLive,
+ const GCNRPTracker::LiveRegSet &savingOutputLive,
+ GCNRPTracker::LiveRegSet &passThrus, GCNRPTracker::LiveRegSet &usedRegs,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, const MachineLoopInfo *MLI,
+ SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *pDT,
+ bool bCanClone, bool bVOutBound, bool bSOutBound,
+ GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) {
+ std::vector<SubExp> partialSubExps = buildSubExpCandidates(Remat,
+ Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT, bCanClone,
+ bSOutBound, unUsedPassThrus, status.MemWriteMBBSet,
+ bAllowPartialUseInSubExp);
+
+ GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive;
+ GCNRPTracker::LiveRegSet tmpSavingOutputLive = savingOutputLive;
+ std::pair<int, int> curSaving = calculateSaving(
+ hotBB, partialSubExps, tmpSavingInputLive, tmpSavingOutputLive,
+ bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+ const int VLimit = status.TargetVLimit;
+ const int SLimit = status.TargetSLimit;
+
+ vgpr += curSaving.first;
+ sgpr += curSaving.second;
+
+ if (vgpr <= VLimit && sgpr <= SLimit) {
+    // partialSubExps can help reach the target occupancy; add them to
+    // subExpCandidates.
+ addExpCandidates(subExpCandidates, partialSubExps, usedRegs);
+ return true;
+ }
+
+ if (EnableSubExpAggressive) {
+    // Build hoist candidates from the hot block's live-ins which are not
+    // already used by partialSubExps.
+ GCNRPTracker::LiveRegSet sinkUsedRegs;
+ for (auto &Exp : partialSubExps) {
+ for (auto &Reg : Exp.BottomRegs) {
+ sinkUsedRegs[Reg];
+ }
+ }
+ MapVector<MachineBasicBlock *, GCNRPTracker::LiveRegSet> HoistCandidates;
+ for (auto &it : hotBB.inputLive) {
+ unsigned Reg = it.first;
+      // Skip regs already used for a sink exp.
+ if (sinkUsedRegs.count(Reg))
+ continue;
+ if (usedRegs.count(Reg))
+ continue;
+ // Skip unsafe reg.
+ if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ false)) {
+ LLVM_DEBUG(dbgs() << " is not safe to hoist\n");
+ continue;
+ }
+ // DefMI is already checked in isSafeCandidate.
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+ MachineBasicBlock *DefMBB = DefMI->getParent();
+ DenseSet<MachineBasicBlock *> UseMBBSet;
+      // Make sure all uses outside the def block are in a single block.
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ MachineBasicBlock *UseMBB = UseMI.getParent();
+ if (UseMBB == DefMBB)
+ continue;
+ UseMBBSet.insert(UseMBB);
+ }
+
+ if (UseMBBSet.size() != 1)
+ continue;
+ MachineBasicBlock *UseMBB = *UseMBBSet.begin();
+ GCNRPTracker::LiveRegSet &UseInMBB = HoistCandidates[UseMBB];
+ UseInMBB[Reg] = getRegMask(DefMI->getOperand(0), MRI);
+ }
+
+ SlotIndexes *slotIndexes = LIS->getSlotIndexes();
+ // Build exp dag on define blocks.
+ std::vector<SubExp> hoistSubExpCandidates;
+ // Save profit candidates into list.
+ for (auto it : HoistCandidates) {
+ MachineBasicBlock *UseMBB = it.first;
+      // Try to hoist the sub exps in UseMBB which use the candidate regs.
+      GCNRPTracker::LiveRegSet &UseInMBB = it.second;
+      // Go up the dag until a shared node is reached.
+ auto subExps = buildSubExpFromCandidatesTopBottom(Remat, UseInMBB, UseMBB, SIRI,
+ SIII, MRI, slotIndexes);
+ for (SubExp &subExp : subExps) {
+ if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, bSOutBound))
+ continue;
+ subExp.bHoist = true;
+ hoistSubExpCandidates.emplace_back(subExp);
+ }
+ }
+
+ std::pair<int, int> hoistSaving = calculateSaving(
+ hotBB, hoistSubExpCandidates, tmpSavingInputLive, tmpSavingOutputLive,
+ bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+
+ int hoistVgpr = vgpr + hoistSaving.first;
+ int hoistSgpr = sgpr + hoistSaving.second;
+
+ if ((hoistVgpr <= VLimit && hoistSgpr <= SLimit) ||
+        // If status is not balanced, do the remat even if the target cannot
+        // be reached.
+        // TODO: check whether the result helps by at least one wave of
+        // occupancy.
+ (!hoistSubExpCandidates.empty() && !status.bNotBalance &&
+ TargetOccupancy != 0)) {
+      // The collected subExps can help reach the target occupancy; add them
+      // to subExpCandidates.
+ addExpCandidates(subExpCandidates, partialSubExps, usedRegs);
+ addExpCandidates(subExpCandidates, hoistSubExpCandidates, usedRegs);
+
+ return true;
+ }
+ }
+
+ if (EnableVmemDegree &&
+      // Only consider vmem on the last tryToAddSubExps attempt; otherwise
+      // bAllowPartialUseInSubExp would never get a chance to be true.
+ (bAllowPartialUseInSubExp ||
+ !EnableSubExpAggressive)) {
+    // Assume vmemLdSize could be avoided by not keeping the vmem loads in
+    // flight in parallel.
+ if (((vgpr - hotBB.vmemLdInputSize) <= VLimit ||
+ (vgpr - hotBB.vmemLdOutputSize) <= VLimit) &&
+ sgpr <= SLimit) {
+      // partialSubExps can help reach the target occupancy; add them to
+      // subExpCandidates.
+ addExpCandidates(subExpCandidates, partialSubExps, usedRegs);
+ return true;
+ }
+ }
+
+ int vDistance = vgpr - (int)VLimit;
+ int sDistance = status.TargetOcc > 4 ? (sgpr - (int)SLimit) : 0;
+ int vSaved = hotBB.maxPressures.first - vgpr;
+ int sSaved = hotBB.maxPressures.second - sgpr;
+ // Try to add inBlockCloneSubExps.
+ if (!tryRematInHotSpot(*hotBB.MBB, status, vDistance, sDistance, vSaved,
+ sSaved, inBlockCloneSubExps, inBlockHotVInstMap,
+ inBlockHotSInstMap, LIS, MRI, SIRI, SIII)) {
+    // Always return false when partialUseInSubExp is not allowed; the caller
+    // will try again with partialUseInSubExp enabled.
+ if (!bAllowPartialUseInSubExp)
+ return false;
+    // If status is not balanced, do the remat even if the target cannot be
+    // reached.
+    // TODO: check whether the result helps by at least one wave of occupancy.
+ if (!status.bNotBalance && TargetOccupancy == 0)
+ return false;
+ }
+  // partialSubExps can help reach the target occupancy; add them to
+  // subExpCandidates.
+ addExpCandidates(subExpCandidates, partialSubExps, usedRegs);
+ return true;
+}
+
+// Remat pass-thru regs per hot block.
+// Doing it per block keeps pass-thru reuse precise: if remat were tried on
+// all hot blocks together, a reg might be a pass-thru in one block while the
+// reuse happens in another block where it is not a pass-thru.
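+// For example, a reg that is a pass-thru of hot block A but is redefined
+// inside hot block B must not be counted as a reusable pass-thru for B.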
+bool perBlockPassthruRemat(Remat *Remat,
+ std::vector<HotBlock> &hotBlocks,
+ RematStatus &status,
+ GCNRPTracker::LiveRegSet &liveRegCandidates,
+ const GCNSubtarget *ST, LiveIntervals *LIS,
+ const MachineLoopInfo *MLI,
+ MachineDominatorTree *pDT, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII) {
+ bool bUpdated = false;
+  bool bCanClone = EnableSubExpClone || EnableSubExpAggressive;
+
+ SlotIndexes *slotIndexes = LIS->getSlotIndexes();
+  // Sort hot blocks by pressure first.
+  // The hot block with higher pressure is more likely to fail.
+  // If it fails, fail fast. If it works, save the subExpCandidates; they may
+  // help other hot blocks.
+ std::sort(hotBlocks.begin(), hotBlocks.end(),
+ [&ST](const HotBlock &a, const HotBlock &b) {
+ return pressureHigher(a.maxPressures.first, a.maxPressures.second,
+ b.maxPressures.first, b.maxPressures.second,
+ ST);
+ });
+
+ std::vector<SubExp> subExpCandidates;
+ // For inBlock remat clone.
+ std::vector<SubExp> inBlockCloneSubExps;
+ DenseMap<MachineBasicBlock *, MachineInstr *> inBlockHotVInstMap;
+ DenseMap<MachineBasicBlock *, MachineInstr *> inBlockHotSInstMap;
+
+  // Save used passThrus to avoid using the same reg in different MBBs.
+  GCNRPTracker::LiveRegSet usedPassThrus;
+  // Save moved regs to avoid both hoisting and sinking the same reg.
+ GCNRPTracker::LiveRegSet usedRegs;
+
+ const int VLimit = status.TargetVLimit;
+ const int SLimit = status.TargetSLimit;
+ // Collect passthru for hot block.
+ // Try remat on it.
+ for (auto &it : hotBlocks) {
+ MachineBasicBlock *MBB = it.MBB;
+
+ const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB];
+ const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB];
+
+ it.inputLive = inputLive;
+
+    // Bump the pressure by 1 to account for spilling to a VGPR.
+ const int PressureDelta = -1;
+ int vgpr = it.maxPressures.first - PressureDelta;
+ int sgpr = it.maxPressures.second;
+ bool bVOutBound = vgpr > VLimit;
+ bool bSOutBound = sgpr > SLimit;
+    // savingInputLive is used to calculate the saving; it is modified to
+    // avoid counting the same input multiple times.
+ GCNRPTracker::LiveRegSet savingInputLive = inputLive;
+ GCNRPTracker::LiveRegSet savingOutputLive = outputLive;
+ std::pair<int, int> curSaving =
+ calculateSaving(it, subExpCandidates, savingInputLive, savingOutputLive,
+ bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+
+ vgpr += curSaving.first;
+ sgpr += curSaving.second;
+
+ if (vgpr <= VLimit && sgpr <= SLimit)
+ continue;
+
+ // Collect pass thru regs.
+ GCNRPTracker::LiveRegSet passThrus =
+ collectPassThrus(MBB, inputLive, outputLive, usedPassThrus,
+ liveRegCandidates, MRI, bCanClone);
+
+ // Group pass thru regs by def MBB.
+ SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
+ Candidates =
+ groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, MRI, SIRI, SIII);
+    // unusedPassThrus collects the passThrus which were skipped when building
+    // subExps.
+ GCNRPTracker::LiveRegSet unusedPassThrus;
+ // Build exp dag on define blocks.
+ bool bAllowPartialUseInSubExp = false;
+ if (tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps,
+ inBlockHotVInstMap, inBlockHotSInstMap, Candidates,
+ vgpr, sgpr, savingInputLive, savingOutputLive,
+ passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes,
+ LIS, pDT, bCanClone, bVOutBound, bSOutBound,
+ unusedPassThrus, bAllowPartialUseInSubExp)) {
+ // Remove unusedPassThrus from passThrus first.
+ llvm::andNotLiveRegSet(passThrus, unusedPassThrus);
+ llvm::mergeLiveRegSet(usedPassThrus, passThrus);
+ continue;
+ }
+    // If cloning is not possible, there is no need to try
+    // partialUseInSubExp, which requires cloning.
+ if (!bCanClone)
+ return false;
+
+    // Partial-use subExps may result in a large ALU count because of cloning.
+    // Only try them when aggressive remat is enabled.
+ if (!EnableSubExpAggressive)
+ return false;
+
+ bAllowPartialUseInSubExp = true;
+ if (!tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps,
+ inBlockHotVInstMap, inBlockHotSInstMap, Candidates,
+ vgpr, sgpr, savingInputLive, savingOutputLive,
+ passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes,
+ LIS, pDT, bCanClone, bVOutBound, bSOutBound,
+ unusedPassThrus, bAllowPartialUseInSubExp)) {
+ return false;
+ }
+    // Just merge all passThrus after tryToAddSubExps with
+    // partialUseInSubExp allowed.
+ llvm::mergeLiveRegSet(usedPassThrus, passThrus);
+ }
+
+ // Apply changes.
+ {
+    // Sort subExpCandidates to make sure input uses are applied before output
+    // uses when a reg is both an input and an output of subExps.
+ LLVM_DEBUG(for (SubExp &Exp : subExpCandidates) { Exp.dump(MRI, SIRI); });
+ sortSubExpCandidates(subExpCandidates);
+
+ for (SubExp &Exp : subExpCandidates) {
+      // Skip exps which were cleared during sorting due to a hoist/sink
+      // conflict.
+ if (Exp.SUnits.empty())
+ continue;
+ LLVM_DEBUG(Exp.dump(MRI, SIRI));
+ if (Exp.bHoist) {
+ ApplySubExpMoveNearDefine(Exp, MRI, pDT, slotIndexes, SIII, SIRI);
+ } else {
+ if (Exp.bCloneOnly)
+ ApplySubExpCloneNearUser(Exp, hotBlocks, pDT, MRI, slotIndexes, SIII,
+ SIRI);
+ else
+ ApplySubExpMoveNearUser(Exp, MRI, pDT, slotIndexes, SIII, SIRI);
+ }
+ }
+
+ for (SubExp &Exp : inBlockCloneSubExps) {
+ ApplySubExpCloneNearUserInBlock(Exp, inBlockHotVInstMap,
+ inBlockHotSInstMap, MRI, slotIndexes,
+ SIII, SIRI);
+ }
+    // Try to see what occupancy could be reached, then decide a target.
+    // Apply remat.
+    bUpdated = !subExpCandidates.empty();
+ }
+
+ return bUpdated;
+}
+
+int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI, const MachineRegisterInfo &MRI) {
+ int vmemLdSize = 0;
+  // Collect vmem load size.
+ for (MachineInstr &MI : MBB) {
+ bool bIsHighLatency = SIII->isHighLatencyInstruction(MI);
+ if (!bIsHighLatency)
+ continue;
+ if (!(MI.mayLoad() &&
+          // Skip cases like atomics which do not return a value.
+ MI.getNumDefs() > 0))
+ continue;
+ // a vmem ld.
+ MachineOperand &Dst = MI.getOperand(0);
+ LaneBitmask mask = llvm::getRegMask(Dst, MRI);
+ unsigned size = llvm::getRegSize(Dst.getReg(), mask, MRI, SIRI);
+ vmemLdSize += size;
+ }
+ return vmemLdSize;
+}
+
+} // namespace
+
+bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
+                LiveIntervals *LIS, MachineDominatorTree *pDT,
+                MachinePostDominatorTree *pPDT, AliasAnalysis *AA) {
+ if (MF.size() < 2)
+ return false;
+ const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+
+ const SIInstrInfo *SIII = ST->getInstrInfo();
+ const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+ auto &MRI = MF.getRegInfo();
+
+ RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST);
+
+ const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+ if (status.TargetOcc >= MaxOcc)
+ return false;
+
+ unsigned VLimit = status.TargetVLimit;
+ unsigned SLimit = status.TargetSLimit;
+
+ int rematVCnt = status.MaxVPressure - VLimit;
+ int rematSCnt = status.MaxSPressure - SLimit;
+
+ bool bSGPRSpill = false;
+ if (rematSCnt > 0) {
+ bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF);
+ }
+
+ // If bound by lds, skip.
+ if ((status.TargetOcc + 1) > ST->getOccupancyWithLocalMemSize(MF) &&
+ !bSGPRSpill)
+ return false;
+
+ bool bBothOutLimit = rematVCnt > 0 && rematSCnt > 0;
+  // TODO: check WQM and support VGPR remat.
+ bool bCheckWQM = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
+ rematVCnt = bCheckWQM & false;
+
+ // Remat on every hot block.
+
+ // Collect all hot blocks.
+ std::vector<HotBlock> hotBlocks;
+ for (MachineBasicBlock &MBB : MF) {
+ // Collect reg pressure.
+ auto &RP = status.MBBPressureMap[&MBB];
+ unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
+ unsigned maxLocalSPressure = RP.getMaxSGPR();
+
+ maxLocalSPressure += RegForVCC;
+
+ if (!EnableInBlockRemat) {
+ if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit)
+ continue;
+ }
+
+    // Move insts whose inputs are imm/pass-thru regs/output regs to help
+    // pressure.
+ if (tryHoldPacifist(MBB, LIS, MRI, SIRI, SIII, AA, status)) {
+ maxLocalVPressure = 0;
+ maxLocalSPressure = 0;
+ CollectMBBPressure(MBB, LIS, MRI, ST, maxLocalVPressure,
+ maxLocalSPressure, status);
+
+ maxLocalSPressure += RegForVCC;
+
+ }
+ if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit)
+ continue;
+
+ // When both vgpr sgpr out limit, only help vgpr.
+ if (bBothOutLimit && maxLocalVPressure <= VLimit)
+ continue;
+ GCNRPTracker::LiveRegSet liveSet;
+    hotBlocks.push_back({&MBB, liveSet,
+                         std::make_pair(maxLocalVPressure, maxLocalSPressure),
+                         0, 0});
+ }
+ // Collect vmemLdInput/OutputSize.
+ if (EnableVmemDegree) {
+ DenseMap<MachineBasicBlock *, unsigned> outputVMemLdSizeMap;
+ for (auto it : hotBlocks) {
+ MachineBasicBlock *MBB = it.MBB;
+      // Collect vmem load size when split is enabled.
+ int vmemLdSize = getVMemLdSize(*MBB, SIII, SIRI, MRI);
+ if (vmemLdSize) {
+ outputVMemLdSizeMap[MBB] = vmemLdSize;
+ }
+ }
+ for (auto &it : hotBlocks) {
+ MachineBasicBlock *MBB = it.MBB;
+
+ auto oit = outputVMemLdSizeMap.find(MBB);
+ if (oit != outputVMemLdSizeMap.end())
+ it.vmemLdOutputSize = oit->second;
+
+ if (MBB->pred_size() != 1)
+ continue;
+
+ MachineBasicBlock *Pred = *MBB->pred_begin();
+ oit = outputVMemLdSizeMap.find(Pred);
+ if (oit != outputVMemLdSizeMap.end()) {
+ it.vmemLdInputSize = oit->second;
+ } else {
+ if (Pred->getFirstTerminator() != Pred->end())
+ continue;
+ if (Pred->empty())
+ continue;
+ bool bIsHighLatency = SIII->isHighLatencyInstruction(Pred->back());
+ if (!bIsHighLatency)
+ continue;
+ int vmemLdSize = getVMemLdSize(*Pred, SIII, SIRI, MRI);
+ it.vmemLdInputSize = vmemLdSize;
+ }
+ }
+ }
+
+ if (EnableUniformVectorToScalar) {
+ if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, hotBlocks, LIS, MRI,
+ SIRI, SIII, MLI)) {
+ // Rebuild LIS.
+ LIS->reanalyze(MF);
+ status = GetRematStatus(MF, MLI, LIS, MRI, ST);
+ bool bSgprSpilled = nearSgprSpill(status.MaxSPressure, ST, MF);
+ if (bSgprSpilled) {
+ bool bNearTarget = false;
+ hotBlockRemat(Remat, MF, MLI, LIS, pDT, pPDT, bNearTarget);
+ // Rebuild LIS.
+ LIS->reanalyze(MF);
+ status = GetRematStatus(MF, MLI, LIS, MRI, ST);
+ }
+
+ for (auto &it : hotBlocks) {
+ MachineBasicBlock *MBB = it.MBB;
+
+ // Update pressure.
+ auto &RP = status.MBBPressureMap[MBB];
+ unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
+ unsigned maxLocalSPressure = RP.getMaxSGPR();
+
+ maxLocalSPressure += RegForVCC;
+ it.maxPressures.first = maxLocalVPressure;
+ it.maxPressures.second = maxLocalSPressure;
+ }
+ }
+ }
+
+  // Collect all live regs which cross hot blocks.
+ GCNRPTracker::LiveRegSet liveRegCandidates;
+ for (auto it : hotBlocks) {
+ MachineBasicBlock *MBB = it.MBB;
+
+ const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB];
+
+ const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB];
+
+ llvm::mergeLiveRegSet(liveRegCandidates, inputLive);
+ llvm::mergeLiveRegSet(liveRegCandidates, outputLive);
+ }
+
+ // Check min VGPR bound.
+ BlockSet PressureUnderLimitSet;
+ if (EnableSubExpMinReg) {
+ for (auto &it : hotBlocks) {
+ MachineBasicBlock *MBB = it.MBB;
+ unsigned MaxLocalVGPR = 0;
+ unsigned MaxLocalSGPR = 0;
+ llvm::getRegBound(MBB, MRI, SIRI, SIII, LIS, MaxLocalVGPR, MaxLocalSGPR);
+
+ if (MaxLocalVGPR < VLimit && MaxLocalSGPR < SLimit) {
+ PressureUnderLimitSet.insert(MBB);
+ } else {
+ if (MaxLocalVGPR < it.maxPressures.first)
+ it.maxPressures = std::make_pair(MaxLocalVGPR, it.maxPressures.second);
+ if (MaxLocalSGPR < it.maxPressures.second)
+ it.maxPressures = std::make_pair(it.maxPressures.first, MaxLocalSGPR);
+ }
+ }
+ }
+
+ bool bUpdated = perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates,
+ ST, LIS, MLI, pDT, MRI, SIRI, SIII);
+
+ return bUpdated;
+}
+
+bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.size() < 2)
+ return false;
+ LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  MachineDominatorTree *DT =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  MachinePostDominatorTree *PDT =
+      &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+ MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ {
+ llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI);
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (DA.isUniform(&MI)) {
+ TotalUniformInsts.insert(&MI);
+ }
+ }
+ }
+ }
+
+ // For non-cs/ps, set target occ as 4.
+ bool bNearTarget = false;
+ bool bFinalUpdated = false;
+ bool bUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, bNearTarget);
+ bFinalUpdated |= bUpdated;
+ if (EnableSubExp) {
+ if (bUpdated) {
+ // Rebuild LIS.
+ LIS->reanalyze(MF);
+ }
+
+ bUpdated = GroupRemat(this, MF, MLI, LIS, DT, PDT, AA);
+
+ bFinalUpdated |= bUpdated;
+ }
+ return bFinalUpdated;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
+ "AMDGPU rematerialize", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, "AMDGPU rematerialize",
+ false, false)
+
+char AMDGPUHotBlockRematerialize::ID = 0;
+char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
+
+FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() {
+ return new AMDGPUHotBlockRematerialize();
+}
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
new file mode 100644
index 000000000000000..6f44fec08239cde
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -0,0 +1,2241 @@
+///////////////////////////////////////////////////////////////////////////////
+// //
+// AMDGPUMIRUtils.cpp //
+// Copyright (C) Microsoft Corporation. All rights reserved. //
+// This file is distributed under the University of Illinois Open Source //
+// License. See LICENSE.TXT for details. //
+// //
+// Util functions for llvm MIR Passes. //
+// //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/ADT/IntEqClasses.h"
+#include "llvm/Support/GraphWriter.h"
+
+#include "llvm/Support/Debug.h"
+
+#include "GCNRegPressure.h"
+#include "AMDGPUMIRUtils.h"
+#include "AMDGPUSubExpDag.h"
+#include <unordered_set>
+
+#define DEBUG_TYPE "xb-mir-util"
+using namespace llvm;
+namespace {
+class CFGWithPhi {
+public:
+ CFGWithPhi(MachineFunction &F) : F(F) {
+ // Collect phi and phi related insts.
+ MachineRegisterInfo &MRI = F.getRegInfo();
+
+ for (MachineBasicBlock &BB : F) {
+ auto &phiInsts = blockToPhiInstsMap[&BB];
+ for (MachineInstr &I : BB) {
+ if (!I.isPHI())
+ break;
+ phiInsts.insert(&I);
+ unsigned Reg = I.getOperand(0).getReg();
+ // Add incoming values.
+        for (unsigned i = 1; i < I.getNumOperands(); i += 2) {
+ MachineOperand &MO = I.getOperand(i);
+ if (!MO.isReg())
+ continue;
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(MO.getReg());
+ if (!DefMI)
+ continue;
+ blockToPhiInstsMap[DefMI->getParent()].insert(DefMI);
+ }
+ // Add users.
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ blockToPhiInstsMap[UseMI.getParent()].insert(&UseMI);
+ }
+ }
+ }
+  }
+
+  /// Adds custom features for a visualization of the CFG-with-phi graph.
+  void addCustomGraphFeatures(llvm::GraphWriter<CFGWithPhi *> &) const {}
+ MachineFunction &F;
+ DenseMap<const MachineBasicBlock *, DenseSet<MachineInstr *>> blockToPhiInstsMap;
+ void dump();
+};
+
+void CFGWithPhi::dump() {
+#ifdef DBG
+ for (MachineBasicBlock &BB : F) {
+ dbgs() << BB.getName() << "\n";
+ auto &phiInsts = blockToPhiInstsMap[&BB];
+ for (MachineInstr *I : phiInsts) {
+ if (!I->isPHI())
+ continue;
+ I->dump();
+ }
+ for (MachineInstr *I : phiInsts) {
+ if (I->isPHI())
+ continue;
+ I->dump();
+ }
+ }
+#endif
+}
+
+} // namespace
+
+// CFGWithPhi dump.
+namespace llvm {
+
+template <> struct DOTGraphTraits<CFGWithPhi *> : public DefaultDOTGraphTraits {
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const CFGWithPhi *G) {
+ return "CFG with Phi graph";
+ }
+
+ static std::string getNodeIdentifierLabel(const MachineBasicBlock *Node,
+ const CFGWithPhi *Graph) {
+ std::string R;
+ raw_string_ostream OS(R);
+ OS << static_cast<const void *>(Node);
+ return R;
+ }
+
+ static std::string getNodeLabel(const MachineBasicBlock *BB, const CFGWithPhi *G) {
+ enum { MaxColumns = 8000 };
+ std::string Str;
+ raw_string_ostream OS(Str);
+
+ OS << "BB:" << BB->getName();
+ auto it = G->blockToPhiInstsMap.find(BB);
+ if (it != G->blockToPhiInstsMap.end()) {
+
+ auto &phiInsts = it->second;
+ for (MachineInstr *I : phiInsts) {
+ if (!I->isPHI())
+ continue;
+ I->print(OS);
+ OS << "\n";
+ }
+ for (MachineInstr *I : phiInsts) {
+ if (I->isPHI())
+ continue;
+ I->print(OS);
+ OS << "\n";
+ }
+ }
+ std::string OutStr = OS.str();
+ if (OutStr[0] == '\n')
+ OutStr.erase(OutStr.begin());
+
+ // Process string output to make it nicer...
+ unsigned ColNum = 0;
+ unsigned LastSpace = 0;
+ for (unsigned i = 0; i != OutStr.length(); ++i) {
+ if (OutStr[i] == '\n') { // Left justify
+ OutStr[i] = '\\';
+ OutStr.insert(OutStr.begin() + i + 1, 'l');
+ ColNum = 0;
+ LastSpace = 0;
+ } else if (OutStr[i] == ';') { // Delete comments!
+ unsigned Idx = OutStr.find('\n', i + 1); // Find end of line
+ OutStr.erase(OutStr.begin() + i, OutStr.begin() + Idx);
+ --i;
+ } else if (ColNum == MaxColumns) { // Wrap lines.
+ // Wrap very long names even though we can't find a space.
+ if (!LastSpace)
+ LastSpace = i;
+ OutStr.insert(LastSpace, "\\l...");
+ ColNum = i - LastSpace;
+ LastSpace = 0;
+ i += 3; // The loop will advance 'i' again.
+ } else
+ ++ColNum;
+ if (OutStr[i] == ' ')
+ LastSpace = i;
+ }
+ return OutStr;
+ }
+ static std::string getNodeDescription(const MachineBasicBlock *SU,
+ const CFGWithPhi *G) {
+ return SU->getName().str();
+ }
+
+ static void addCustomGraphFeatures(CFGWithPhi *G,
+ GraphWriter<CFGWithPhi *> &GW) {
+ return G->addCustomGraphFeatures(GW);
+ }
+};
+
+template <> struct GraphTraits<CFGWithPhi *> {
+ using NodeRef = MachineBasicBlock *;
+ using ChildIteratorType = MachineBasicBlock::succ_iterator;
+ using nodes_iterator = pointer_iterator<MachineFunction::iterator>;
+
+ // static NodeRef getEntryNode(const CFGWithPhi *G) {
+ // return G->F.getFunctionEntry();
+ //}
+
+ static ChildIteratorType child_begin(const NodeRef N) {
+ return N->succ_begin();
+ }
+
+ static ChildIteratorType child_end(const NodeRef N) { return N->succ_end(); }
+
+ static nodes_iterator nodes_begin(const CFGWithPhi *G) {
+ return nodes_iterator(G->F.begin());
+ }
+
+ static nodes_iterator nodes_end(const CFGWithPhi *G) {
+ return nodes_iterator(G->F.end());
+ }
+};
+
+} // namespace llvm
+
+namespace llvm {
+
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI) {
+ unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
+ Size >>= 5;
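+  // Size is now the number of 32-bit lanes, e.g. a 128-bit reg class gives 4.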
+ LaneBitmask mask = Mask;
+ if (mask.any()) {
+ if (unsigned maskSize = mask.getNumLanes()) {
+ if (maskSize < Size)
+ Size = maskSize;
+ }
+ }
+ return Size;
+}
+
+void CollectLiveSetPressure(const LiveSet &liveSet,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, unsigned &VPressure,
+ unsigned &SPressure) {
+ VPressure = 0;
+ SPressure = 0;
+ for (auto liveIt : liveSet) {
+ unsigned Reg = liveIt.first;
+ unsigned Size = getRegSize(Reg, liveIt.second, MRI, SIRI);
+ if (SIRI->isVGPR(MRI, Reg)) {
+ VPressure += Size;
+ } else {
+ SPressure += Size;
+ }
+ }
+}
+
+bool isExecUpdateForControlFlow(llvm::MachineInstr &MI) {
+ bool isExecUpdate = false;
+ unsigned opcode = MI.getOpcode();
+ if (opcode == AMDGPU::S_MOV_B64 || opcode == AMDGPU::S_MOV_B32 ||
+ opcode == AMDGPU::S_OR_B64_term || opcode == AMDGPU::S_OR_B32_term ||
+ opcode == AMDGPU::S_OR_SAVEEXEC_B64 ||
+ opcode == AMDGPU::S_OR_SAVEEXEC_B32 || opcode == AMDGPU::S_AND_B64 ||
+ opcode == AMDGPU::S_AND_B32 || opcode == AMDGPU::S_ANDN2_B64 ||
+ opcode == AMDGPU::S_ANDN2_B32) {
+ MachineOperand &Dst = MI.getOperand(0);
+ if (Dst.getReg() == AMDGPU::EXEC || Dst.getReg() == AMDGPU::EXEC_LO) {
+ isExecUpdate = true;
+ }
+ }
+ return isExecUpdate;
+}
+
+bool IsSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+ // Support multi def for pattern of pointer:
+ // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32
+ // %808.sub1:sgpr_64 = S_MOV_B32 0
+ bool bHasSub0 = false;
+ bool bHasSub1 = false;
+ for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) {
+ if (unsigned SubReg = UserDefMO.getSubReg()) {
+ bool bSingleSubReg = false;
+ switch (SubReg) {
+ default:
+ break;
+ case AMDGPU::sub0:
+ if (!bHasSub0) {
+ bHasSub0 = true;
+ bSingleSubReg = true;
+ }
+ break;
+ case AMDGPU::sub1:
+ if (!bHasSub1) {
+ bHasSub1 = true;
+ bSingleSubReg = true;
+ }
+ break;
+ }
+ if (!bSingleSubReg) {
+ bHasSub0 = false;
+ break;
+ }
+ } else {
+ bHasSub0 = false;
+ break;
+ }
+ }
+
+ return (bHasSub0 && bHasSub1);
+}
+
+LaneBitmask getRegMask(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) {
+ // We don't rely on read-undef flag because in case of tentative schedule
+ // tracking it isn't set correctly yet. This works correctly however since
+ // use mask has been tracked before using LIS.
+ return MO.getSubReg() == 0
+ ? MRI.getMaxLaneMaskForVReg(MO.getReg())
+ : MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(
+ MO.getSubReg());
+}
+
+void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) {
+ for (auto Reg : inputSet) {
+ unsigned reg = Reg.first;
+ LaneBitmask mask = Reg.second;
+ auto targetReg = targetSet.find(reg);
+ if (targetReg != targetSet.end()) {
+ LaneBitmask targetMask = targetReg->second;
+ mask |= targetMask;
+ }
+ targetSet[reg] = mask;
+ }
+}
+
+void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) {
+ GCNRPTracker::LiveRegSet AndSet;
+ for (auto Reg : inputSet) {
+ unsigned reg = Reg.first;
+ LaneBitmask mask = Reg.second;
+ auto targetReg = targetSet.find(reg);
+ if (targetReg != targetSet.end()) {
+ LaneBitmask targetMask = targetReg->second;
+ mask &= targetMask;
+ AndSet[reg] = mask;
+ }
+ }
+
+ targetSet = AndSet;
+}
+
+void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) {
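+  // Removes the input lanes from the target set. Example: with targetSet
+  // {%a: 0b11, %b: 0b01} and inputSet {%a: 0b01}, targetSet becomes
+  // {%a: 0b10, %b: 0b01}.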
+ for (auto Reg : inputSet) {
+ unsigned reg = Reg.first;
+ LaneBitmask mask = Reg.second;
+ auto targetReg = targetSet.find(reg);
+ if (targetReg != targetSet.end()) {
+ LaneBitmask targetMask = targetReg->second;
+ if ((targetMask | mask) == mask)
+ targetSet.erase(reg);
+ else
+ targetSet[reg] = targetMask & (~mask);
+ }
+ }
+}
+
+MachineBasicBlock *split(MachineInstr *Inst) {
+
+ // Create the fall-through block.
+ MachineBasicBlock *MBB = Inst->getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock();
+ auto MBBIter = ++(MBB->getIterator());
+ MF->insert(MBBIter, SuccMBB);
+ SuccMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MBB->addSuccessor(SuccMBB);
+
+ // Splice the code over.
+ SuccMBB->splice(SuccMBB->end(), MBB, ++Inst->getIterator(), MBB->end());
+
+ return SuccMBB;
+}
+
+struct Piece {
+ unsigned Reg;
+ unsigned offset;
+ unsigned size;
+ static SmallVector<Piece, 8> split(std::bitset<32> mask) {
+
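+    // Example: mask 0b0000'1110 yields one piece {offset = 1, size = 3};
+    // a run of 10 set bits starting at bit 0 is split into pieces of size 8
+    // and size 2.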
+ SmallVector<Piece, 8> pieces;
+ Piece piece = {0, 0, 0};
+ for (unsigned i = 0; i < 32; i++) {
+ if (mask.test(i)) {
+ if (piece.size == 0)
+ piece.offset = i;
+
+ piece.size++;
+        // Make sure no piece is bigger than 8.
+ if (piece.size == 8) {
+ pieces.emplace_back(piece);
+ piece.size = 0;
+ }
+ } else {
+ if (piece.size == 0) {
+ continue;
+ }
+ pieces.emplace_back(piece);
+ piece.size = 0;
+ }
+ }
+ return pieces;
+ }
+};
+
+void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC,
+ unsigned offset, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII) {
+ unsigned size = NewRC->getLaneMask().getNumLanes();
+ if (size == 1) {
+ UseMO.setSubReg(0);
+ } else {
+ const uint32_t SubReg = UseMO.getSubReg();
+ LaneBitmask Mask = SIRI->getSubRegIndexLaneMask(SubReg);
+
+ unsigned mask = Mask.getAsInteger() >> offset;
+
+ unsigned NewSubReg = SIRI->getMinimalSpanningSubRegIdxSetForLaneMask(
+ NewRC, LaneBitmask(mask))
+ .front();
+
+ UseMO.setSubReg(NewSubReg);
+ }
+}
+
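+// Narrow the load MI to the smaller variant described by desc, starting at
+// dword channel `offset`: advance the (s)offset operand by offset * 4 bytes,
+// remap the users' sub-register indices, and shrink the dst reg class.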
+bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) {
+ MachineOperand &DstMO = MI.getOperand(0);
+  // Skip the case where the dst subreg is not 0.
+ if (DstMO.getSubReg()) {
+ return false;
+ }
+ unsigned Reg = DstMO.getReg();
+
+ SmallVector<MachineOperand *, 2> UseMOs;
+ for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) {
+ UseMOs.emplace_back(&UseMO);
+ }
+
+ const llvm::TargetRegisterClass *NewRC =
+ SIRI->getRegClass(desc.operands().front().RegClass);
+ unsigned size = NewRC->getLaneMask().getNumLanes();
+ if (offset > 0) {
+ // Update offset operand in MI.
+ MachineOperand *OffsetOp =
+ SIII->getNamedOperand(MI, AMDGPU::OpName::offset);
+
+ const uint32_t LaneSize = sizeof(uint32_t);
+ if (OffsetOp) {
+ if (OffsetOp->isImm()) {
+ assert(OffsetOp != nullptr);
+ int64_t Offset = OffsetOp->getImm();
+ Offset += offset * LaneSize;
+ if (!SIII->isLegalMUBUFImmOffset(Offset)) {
+ return false;
+ }
+ OffsetOp->setImm(Offset);
+ } else {
+ return false;
+ }
+ } else {
+ OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+ if (OffsetOp) {
+ unsigned NewOffsetReg =
+ MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(),
+ SIII->get(AMDGPU::S_ADD_U32))
+ .addDef(NewOffsetReg)
+ .add(*OffsetOp)
+ .addImm(offset * LaneSize);
+ MachineInstr *OffsetAddMI = OffsetAdd.getInstr();
+ MachineBasicBlock::iterator InsertPoint =
+ llvm::FindOrCreateInsertionPointForSccDef(
+ MI.getParent(), MI, SIRI, SIII, &MRI
+ );
+ MI.getParent()->insert(InsertPoint, OffsetAddMI);
+ SIII->legalizeOperands(*OffsetAddMI);
+ OffsetOp->setReg(NewOffsetReg);
+ OffsetOp->setSubReg(0);
+ if (SlotIndexes)
+ SlotIndexes->insertMachineInstrInMaps(*OffsetAddMI);
+ } else {
+ return false;
+ }
+ }
+ // Update subReg for users.
+ for (MachineOperand *UseMO : UseMOs) {
+ updateSubReg(*UseMO, NewRC, offset, SIRI, SIII);
+ }
+ } else if (size == 1) {
+ // Clear subReg when size is 1.
+ for (MachineOperand *UseMO : UseMOs) {
+ UseMO->setSubReg(0);
+ }
+ }
+
+ MI.setDesc(desc);
+ // Mutate reg class of Reg.
+ MRI.setRegClass(Reg, NewRC);
+ return true;
+}
+
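+// If only a contiguous subset of the lanes defined by an S_BUFFER_LOAD_DWORDXn
+// is actually used, shrink the load to the smallest DWORDXm variant that
+// covers the used lanes.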
+bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ SlotIndexes *SlotIndexes) {
+ bool bImm = false;
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM:
+    bImm = true;
+    [[fallthrough]];
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+ unsigned Reg = MI.getOperand(0).getReg();
+ if (!MRI.getUniqueVRegDef(Reg))
+ return false;
+ LaneBitmask dstMask = getRegMask(MI.getOperand(0), MRI);
+ LaneBitmask UseMask;
+ for (MachineOperand &MO : MRI.use_operands(Reg)) {
+ UseMask |= llvm::getRegMask(MO, MRI);
+ }
+
+ const unsigned fullMask = dstMask.getAsInteger();
+ unsigned mask = UseMask.getAsInteger();
+ if (mask == fullMask)
+ return false;
+    // Split the mask where there are gaps, then round each piece up to a
+    // 1/2/4/8 dword load.
+    auto pieces = Piece::split(std::bitset<32>(mask));
+    // Only one piece is supported for now.
+ if (pieces.size() != 1)
+ return false;
+ auto piece = pieces[0];
+ if (piece.size > 8)
+ return false;
+
+    // TODO: enable offset support when bImm is true.
+    // Right now different tests break depending on whether the offset is
+    // multiplied by LaneSize or not.
+ if (bImm && piece.offset != 0)
+ return false;
+
+ switch (piece.size) {
+ default:
+ return false;
+ case 1:
+ return reduceChannel(piece.offset, MI,
+ SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM
+ : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR),
+ MRI, SIRI, SIII, SlotIndexes);
+ case 2:
+ return reduceChannel(piece.offset, MI,
+ SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR),
+ MRI, SIRI, SIII, SlotIndexes);
+ case 3:
+      if (fullMask == 0xf)
+        return false;
+      [[fallthrough]];
+ case 4:
+ return reduceChannel(piece.offset, MI,
+ SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR),
+ MRI, SIRI, SIII, SlotIndexes);
+ case 5:
+ case 6:
+ case 7:
+      if (fullMask == 0xff)
+        return false;
+      [[fallthrough]];
+ case 8:
+ return reduceChannel(piece.offset, MI,
+ SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR),
+ MRI, SIRI, SIII, SlotIndexes);
+ }
+
+ } break;
+ }
+ return false;
+}
+
+// LoopInfo contains a mapping from basic block to the innermost loop. Find
+// the outermost loop in the loop nest that contains BB.
+const MachineLoop *getOutermostLoop(const MachineLoopInfo *LI,
+ const MachineBasicBlock *BB) {
+ const MachineLoop *L = LI->getLoopFor(BB);
+ if (L) {
+ while (const MachineLoop *Parent = L->getParentLoop())
+ L = Parent;
+ }
+ return L;
+}
+
+// True if there is a loop which contains both BB1 and BB2.
+bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1,
+ const MachineBasicBlock *BB2) {
+ const MachineLoop *L1 = getOutermostLoop(LI, BB1);
+ const MachineLoop *L2 = getOutermostLoop(LI, BB2);
+ return L1 != nullptr && L1 == L2;
+}
+
+bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, MachineLoopInfo *LI,
+ MachineBasicBlock *ToBB) {
+ if (FromBB == ToBB) {
+ return true;
+ }
+
+ if (DT->dominates(FromBB, ToBB)) {
+ return true;
+ }
+
+ if (PDT->dominates(ToBB, FromBB)) {
+ return true;
+ }
+
+ if (loopContainsBoth(LI, ToBB, FromBB)) {
+ return true;
+ }
+  // TODO: cover the case where hotBB is in a loop and one block in that loop
+  // dominates BB, or BB post-dominates one block in that loop.
+ return false;
+}
+
+// If BB can reach hotMBBs.
+bool reach_blocks(MachineBasicBlock *BB, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, MachineLoopInfo *LI,
+ DenseSet<MachineBasicBlock *> &hotMBBs) {
+ bool bCross = false;
+ for (MachineBasicBlock *hotBB : hotMBBs) {
+ if (reach_block(BB, DT, PDT, LI, hotBB)) {
+ bCross = true;
+ break;
+ }
+ }
+ return bCross;
+}
+
+}
+
+namespace llvm {
+void viewCFGWithPhi(llvm::MachineFunction &F) {
+#ifdef DBG
+ CFGWithPhi G(F);
+ ViewGraph(const_cast<CFGWithPhi *>(&G), F.getName(), false, F.getName());
+ G.dump();
+#endif
+}
+} // namespace llvm
+
+namespace llvm {
+bool GetNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd,
+ MachineBasicBlock &MBB) {
+  // BBEnd doesn't point at the boundary instruction.
+  // Skip debug instrs.
+ while (BBEnd != MBB.rend() && BBEnd->isDebugInstr())
+ BBEnd++;
+ return BBEnd != MBB.rend();
+}
+} // namespace llvm
+
+// Helper functions to write JSON.
+namespace {
+void json_name(StringRef Val, raw_ostream &os) { os << "\"" << Val << "\":"; }
+
+template <typename write_fn>
+void json_pair(StringRef Val, write_fn &fn, raw_ostream &os) {
+ json_name(Val, os);
+ os << "\"";
+ fn();
+ os << "\"";
+}
+
+template <typename write_fn>
+void json_obj_pair(StringRef Val, write_fn &fn, raw_ostream &os) {
+ json_name(Val, os);
+
+ fn();
+}
+
+template <typename write_fn>
+void json_array(StringRef Val, write_fn &fn, raw_ostream &os) {
+ json_name(Val, os);
+ os << "[";
+ fn();
+ os << "]";
+}
+} // namespace
+
+namespace llvm {
+namespace pressure {
+
+void write_inst(MachineInstr &MI, const SlotIndexes *SlotIndexes,
+ const SIInstrInfo *SIII, raw_ostream &os) {
+ os << "{";
+ SlotIndex Slot = SlotIndexes->getInstructionIndex(MI);
+ auto writeSlot = [&Slot, &os]() { Slot.print(os); };
+
+ json_pair("slot_index", writeSlot, os);
+
+ os << ",";
+
+ auto writeOpcode = [&MI, &SIII, &os]() {
+ os << SIII->getName(MI.getOpcode());
+ };
+
+ json_pair("opcode", writeOpcode, os);
+
+ os << ",";
+
+ auto writeAsm = [&MI, &SIII, &os]() {
+ MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false,
+ /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII);
+ };
+ json_pair("asm", writeAsm, os);
+
+ os << "}";
+}
+
+void print_reg(Register Reg, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, raw_ostream &os) {
+ if (Reg.isVirtual()) {
+ StringRef Name = MRI.getVRegName(Reg);
+ if (Name != "") {
+ os << '%' << Name;
+ } else {
+ os << '%' << Register::virtReg2Index(Reg);
+ }
+ } else if (Reg < SIRI->getNumRegs()) {
+ os << '$';
+ printLowerCase(SIRI->getName(Reg), os);
+ } else {
+ llvm_unreachable("invalid reg");
+ }
+}
+
+void write_reg(unsigned Reg, unsigned SubReg, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, raw_ostream &os) {
+ os << "{";
+
+ auto writeReg = [&MRI, &SIRI, &Reg, &os]() { print_reg(Reg, MRI, SIRI, os); };
+ json_pair("reg", writeReg, os);
+
+ os << ",";
+
+ auto writeSubReg = [&SubReg, &os]() { os << SubReg; };
+
+ json_pair("sub_reg", writeSubReg, os);
+
+ os << ",";
+ auto writeIsSgpr = [&Reg, &MRI, &SIRI, &os]() {
+ if (SIRI->isSGPRReg(MRI, Reg))
+ os << "true";
+ else
+ os << "false";
+ };
+ json_obj_pair("is_sgpr", writeIsSgpr, os);
+ os << "}";
+}
+
+unsigned get_reg_size(unsigned Reg, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) {
+ return SIRI->getRegClassForReg(MRI, Reg)->getLaneMask().getNumLanes();
+}
+
+void write_live(unsigned Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, raw_ostream &os) {
+ if (Mask.none()) {
+ unsigned size = get_reg_size(Reg, MRI, SIRI);
+ Mask = LaneBitmask((1 << size) - 1);
+ }
+ unsigned mask = Mask.getAsInteger();
+ for (unsigned i = 0; i <= Mask.getHighestLane(); i++) {
+ if (mask & (1 << i)) {
+ write_reg(Reg, i, MRI, SIRI, os);
+ os << ",\n";
+ }
+ }
+}
+
+void write_dag_input_node(unsigned ID, unsigned reg, unsigned mask,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, raw_ostream &os) {
+ os << "{";
+ auto writeID = [&ID, &os]() { os << ID; };
+
+ json_pair("ID", writeID, os);
+
+ os << ",";
+
+ auto writeReg = [®, &MRI, &SIRI, &os]() { print_reg(reg, MRI, SIRI, os); };
+
+ json_pair("reg", writeReg, os);
+
+ os << ",";
+
+ auto writeMask = [&mask, &os]() { os << mask; };
+
+ json_pair("mask", writeMask, os);
+
+ os << "},\n";
+}
+
+void write_dag_inst_node(unsigned ID, SlotIndex Slot,
+ GCNRPTracker::LiveRegSet LiveReg,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, SUnit *SU,
+ raw_ostream &os) {
+ os << "{";
+ auto writeID = [&ID, &os]() { os << ID; };
+
+ json_pair("ID", writeID, os);
+
+ os << ",";
+
+ auto writeSlot = [&Slot, &os]() { Slot.print(os); };
+
+ json_pair("slot_index", writeSlot, os);
+
+ os << ",";
+
+ auto writeRegs = [&LiveReg, &MRI, &SIRI, &os]() {
+ for (auto it : LiveReg) {
+ unsigned Reg = it.first;
+ LaneBitmask Mask = it.second;
+ write_live(Reg, Mask, MRI, SIRI, os);
+ }
+ };
+ json_array("regs", writeRegs, os);
+
+ os << ",";
+
+ auto writePreds = [&SU, &os]() {
+ for (auto &Pred : SU->Preds) {
+
+ os << Pred.getSUnit()->NodeNum << ",";
+ }
+ };
+
+ json_array("preds", writePreds, os);
+
+ os << "},\n";
+}
+
+void write_block(MachineBasicBlock &Blk, LiveIntervals *LIS,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, raw_ostream &os) {
+ os << "{\n";
+ auto writeName = [&Blk, &os]() { os << Blk.getName(); };
+ json_pair("name", writeName, os);
+
+ os << ",";
+
+ auto writeIndex = [&Blk, &os]() { os << Blk.getNumber(); };
+ json_pair("id", writeIndex, os);
+
+ os << ",";
+
+ const SlotIndexes *SlotIndexes = LIS->getSlotIndexes();
+
+ SlotIndex BeginSlot = SlotIndexes->getMBBStartIdx(&Blk);
+ auto writeSlot = [&BeginSlot, &os]() { BeginSlot.print(os); };
+ json_pair("begin_slot", writeSlot, os);
+
+ os << ",";
+
+ SlotIndex EndSlot = SlotIndexes->getMBBEndIdx(&Blk);
+ auto writeEndSlot = [&EndSlot, &os]() { EndSlot.print(os); };
+ json_pair("end_slot", writeEndSlot, os);
+
+ os << ",";
+
+ auto writeInsts = [&Blk, &SlotIndexes, &SIII, &os]() {
+ for (MachineInstr &MI : Blk) {
+ if (MI.isDebugInstr())
+ continue;
+ write_inst(MI, SlotIndexes, SIII, os);
+ os << ",\n";
+ }
+ };
+
+ json_array("instructions", writeInsts, os);
+
+ os << ",";
+
+ BlockExpDag dag(&Blk, LIS, MRI, SIRI, SIII);
+ dag.buildWithPressure();
+
+ const auto StartLiveReg = llvm::getLiveRegs(BeginSlot, *dag.LIS, dag.MRI);
+ auto writeInputs = [&StartLiveReg, &dag, &os]() {
+ for (auto it : StartLiveReg) {
+ unsigned Reg = it.first;
+ LaneBitmask mask = it.second;
+ SUnit *SU = dag.InputSUnitMap[Reg];
+ // Write Reg and mask to the nodes.
+ write_dag_input_node(SU->NodeNum, Reg, mask.getAsInteger(), dag.MRI,
+ dag.SIRI, os);
+ }
+ };
+
+ json_array("input_nodes", writeInputs, os);
+
+ os << ",";
+
+  auto writeNodes = [&SlotIndexes, &dag, &os]() {
+    for (auto it : dag.MISUnitMap) {
+      MachineInstr *MI = it.first;
+      SUnit *SU = it.second;
+      // Use the SlotIndex of MI.
+      SlotIndex SI;
+      if (!MI->isDebugInstr())
+        SI = SlotIndexes->getInstructionIndex(*MI);
+      GCNRPTracker::LiveRegSet LiveReg = dag.DagPressureMap[SU];
+      // Write the slot and live regs to the node.
+      write_dag_inst_node(SU->NodeNum, SI, LiveReg, dag.MRI, dag.SIRI,
+                          SU, os);
+    }
+  };
+
+ json_array("inst_nodes", writeNodes, os);
+
+ os << ",";
+
+ auto writePreds = [&Blk, &os]() {
+ for (MachineBasicBlock *Pred : Blk.predecessors()) {
+ os << Pred->getNumber() << ",";
+ }
+ };
+
+ json_array("preds", writePreds, os);
+
+ os << ",";
+
+ auto writeSuccs = [&Blk, &os]() {
+ for (MachineBasicBlock *Succ : Blk.successors()) {
+ os << Succ->getNumber() << ",";
+ }
+ };
+
+ json_array("succs", writeSuccs, os);
+
+ os << "}";
+}
+
+void write_define(SlotIndex &Slot, unsigned Reg, unsigned SubReg,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ raw_ostream &os) {
+ os << "{";
+ auto writeSlot = [&Slot, &os]() { Slot.print(os); };
+
+ json_pair("slot_index", writeSlot, os);
+
+ os << ",";
+
+ auto writeReg = [&MRI, &SIRI, &Reg, &SubReg, &os]() {
+ write_reg(Reg, SubReg, MRI, SIRI, os);
+ };
+ json_obj_pair("reg", writeReg, os);
+
+ os << "}\n";
+
+ os << ",";
+}
+
+void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ raw_ostream &os) {
+ // Split subReg? MO.getSubReg();
+ unsigned Reg = MO.getReg();
+ unsigned SubReg = MO.getSubReg();
+ MachineInstr *MI = MO.getParent();
+ SlotIndex Slot = SlotIndexes->getInstructionIndex(*MI);
+ if (SubReg == 0) {
+ unsigned size = get_reg_size(Reg, MRI, SIRI);
+ for (unsigned i = 0; i < size; i++) {
+ write_define(Slot, Reg, i, MRI, SIRI, os);
+ }
+ } else {
+ switch (SubReg) {
+ default:
+ assert(0 && "SubReg not supported yet.");
+ write_define(Slot, Reg, SubReg, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub0:
+ write_define(Slot, Reg, 0, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub1:
+ write_define(Slot, Reg, 1, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub2:
+ write_define(Slot, Reg, 2, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub3:
+ write_define(Slot, Reg, 3, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub4:
+ write_define(Slot, Reg, 4, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub5:
+ write_define(Slot, Reg, 5, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub6:
+ write_define(Slot, Reg, 6, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub7:
+ write_define(Slot, Reg, 7, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub8:
+ write_define(Slot, Reg, 8, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub9:
+ write_define(Slot, Reg, 9, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub10:
+ write_define(Slot, Reg, 10, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub11:
+ write_define(Slot, Reg, 11, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub12:
+ write_define(Slot, Reg, 12, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub13:
+ write_define(Slot, Reg, 13, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub14:
+ write_define(Slot, Reg, 14, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub15:
+ write_define(Slot, Reg, 15, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub0_sub1:
+ write_define(Slot, Reg, 0, MRI, SIRI, os);
+ write_define(Slot, Reg, 1, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub2_sub3:
+ write_define(Slot, Reg, 2, MRI, SIRI, os);
+ write_define(Slot, Reg, 3, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub4_sub5:
+ write_define(Slot, Reg, 4, MRI, SIRI, os);
+ write_define(Slot, Reg, 5, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub1_sub2:
+ write_define(Slot, Reg, 1, MRI, SIRI, os);
+ write_define(Slot, Reg, 2, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub0_sub1_sub2:
+ write_define(Slot, Reg, 0, MRI, SIRI, os);
+ write_define(Slot, Reg, 1, MRI, SIRI, os);
+ write_define(Slot, Reg, 2, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub0_sub1_sub2_sub3:
+ write_define(Slot, Reg, 0, MRI, SIRI, os);
+ write_define(Slot, Reg, 1, MRI, SIRI, os);
+ write_define(Slot, Reg, 2, MRI, SIRI, os);
+ write_define(Slot, Reg, 3, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub2_sub3_sub4_sub5:
+ write_define(Slot, Reg, 2, MRI, SIRI, os);
+ write_define(Slot, Reg, 3, MRI, SIRI, os);
+ write_define(Slot, Reg, 4, MRI, SIRI, os);
+ write_define(Slot, Reg, 5, MRI, SIRI, os);
+ break;
+ case AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7:
+ write_define(Slot, Reg, 0, MRI, SIRI, os);
+ write_define(Slot, Reg, 1, MRI, SIRI, os);
+ write_define(Slot, Reg, 2, MRI, SIRI, os);
+ write_define(Slot, Reg, 3, MRI, SIRI, os);
+ write_define(Slot, Reg, 4, MRI, SIRI, os);
+ write_define(Slot, Reg, 5, MRI, SIRI, os);
+ write_define(Slot, Reg, 6, MRI, SIRI, os);
+ write_define(Slot, Reg, 7, MRI, SIRI, os);
+ break;
+ }
+ }
+}
+
+void write_defines(MachineFunction &MF, const SlotIndexes *SlotIndexes,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ raw_ostream &os) {
+
+ for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) {
+ auto Reg = Register::index2VirtReg(i);
+
+ for (MachineOperand &MO : MRI.def_operands(Reg)) {
+ write_define(MO, SlotIndexes, MRI, SIRI, os);
+ }
+ }
+}
+
+void write_uses(MachineFunction &MF, const SlotIndexes *SlotIndexes,
+
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ raw_ostream &os) {
+
+ for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) {
+ auto Reg = Register::index2VirtReg(i);
+
+ for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+ // TODO: create write_use if use has more info.
+ write_define(MO, SlotIndexes, MRI, SIRI, os);
+ }
+ }
+}
+
+void write_liveness(SlotIndex Slot, GCNRPTracker::LiveRegSet &LiveSet,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ raw_ostream &os) {
+ os << "{";
+ auto writeSlot = [&Slot, &os]() { Slot.print(os); };
+
+ json_pair("slot_index", writeSlot, os);
+
+ os << ",";
+
+ auto writeRegs = [&LiveSet, &MRI, &SIRI, &os]() {
+ for (auto it : LiveSet) {
+ unsigned Reg = it.first;
+ LaneBitmask Mask = it.second;
+ write_live(Reg, Mask, MRI, SIRI, os);
+ }
+ };
+ json_array("regs", writeRegs, os);
+ os << "\n},\n";
+}
+
+void write_segment(const LiveInterval::Segment &S, raw_ostream &os) {
+ os << "{";
+ auto writeBegin = [&S, &os]() { S.start.print(os); };
+
+ json_pair("begin", writeBegin, os);
+
+ os << ",";
+
+ auto writeEnd = [&S, &os]() { S.end.print(os); };
+
+ json_pair("end", writeEnd, os);
+
+ os << ",";
+
+ auto writeValNum = [&S, &os]() {
+ if (S.valno)
+ os << S.valno->id;
+ else
+ os << 0xFFFFFFFF;
+ };
+
+ json_pair("val_num", writeValNum, os);
+
+ os << "},\n";
+}
+
+void write_subrange(const LiveInterval::SubRange &SR, raw_ostream &os) {
+ os << "{\n";
+ auto writeMask = [&SR, &os]() { os << SR.LaneMask.getAsInteger(); };
+
+ json_pair("mask", writeMask, os);
+
+ os << ",";
+
+ // Segments.
+ auto writeSegments = [&SR, &os]() {
+ for (auto &S : SR.segments) {
+ write_segment(S, os);
+ }
+ };
+
+ json_array("segments", writeSegments, os);
+
+ os << "\n},\n";
+}
+
+void write_live_interval(LiveInterval &LI, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, raw_ostream &os) {
+ os << "{\n";
+
+ auto writeReg = [&LI, &MRI, &SIRI, &os]() {
+ write_reg(LI.reg(), 0, MRI, SIRI, os);
+ };
+
+ json_obj_pair("reg", writeReg, os);
+
+ os << ",";
+
+ auto writeSegments = [&LI, &os]() {
+ for (auto &S : LI.segments) {
+ write_segment(S, os);
+ }
+ };
+
+ json_array("segments", writeSegments, os);
+
+ os << ",";
+
+ auto writeSubRanges = [&LI, &os]() {
+ for (auto &SR : LI.subranges()) {
+ write_subrange(SR, os);
+ }
+ };
+
+ json_array("subranges", writeSubRanges, os);
+
+ os << "},\n";
+}
+
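+// A short illustration of get_legal_str() below (hypothetical input, shown
+// only to describe the transformation): an MDString printed as !"foo.hlsl"
+// becomes foo.hlsl after stripping the leading '!' and the surrounding quotes;
+// any backslash characters in the printed form are replaced with '#'.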
+std::string get_legal_str(const MDString *MDStr) {
+ std::string str;
+ raw_string_ostream Stream(str);
+ MDStr->print(Stream);
+ Stream.flush();
+ // Remove !.
+ str = str.substr(1);
+ // Remove ""
+ str = str.substr(1);
+ str.pop_back();
+ std::replace(str.begin(), str.end(), '\\', '#');
+ return str;
+}
+
+void write_file(const MDNode *FileNode, raw_ostream &os) {
+ const MDString *FileName = cast<MDString>(FileNode->getOperand(0).get());
+  StringRef fileNameStr = FileName->getString();
+  if (fileNameStr.starts_with("__AMDGPU_GPUMAP_"))
+    return;
+  if (fileNameStr.starts_with("__AMDGPU_DWARF_"))
+    return;
+
+ os << "{";
+
+ std::string str0 = get_legal_str(FileName);
+ auto writeName = [&str0, &os]() { os << str0; };
+ json_pair("filename", writeName, os);
+
+ os << ",\n";
+
+ const MDString *Content = cast<MDString>(FileNode->getOperand(1).get());
+ std::string str = get_legal_str(Content);
+ auto writeContent = [&str, &os]() { os << str; };
+ json_pair("content", writeContent, os);
+ os << "\n},\n";
+}
+
+void write_DIFile(const DIFile *File, raw_ostream &os) {
+ if (File) {
+ std::string name = get_legal_str(File->getRawFilename());
+ std::string dir = "";
+ if (MDString *MDDir = File->getRawDirectory())
+ dir = get_legal_str(MDDir);
+ os << dir << name;
+ } else {
+ os << "ArtificialFile";
+ }
+}
+
+void write_line_mapping(SlotIndex Slot, DebugLoc DL, raw_ostream &os) {
+ os << "{";
+
+ auto writeSlot = [&Slot, &os]() { Slot.print(os); };
+
+ json_pair("slot_index", writeSlot, os);
+
+ os << ",\n";
+
+ MDNode *Scope = DL.getScope();
+ unsigned line = DL.getLine();
+ unsigned col = DL.getCol();
+
+ auto writeLine = [&line, &os]() { os << line; };
+ json_pair("line", writeLine, os);
+
+ os << ",\n";
+
+ auto writeCol = [&col, &os]() { os << col; };
+ json_pair("col", writeCol, os);
+
+ os << ",\n";
+
+ auto writeFile = [&Scope, &os]() {
+ const DIFile *File = cast<DIScope>(Scope)->getFile();
+ write_DIFile(File, os);
+ };
+ json_pair("file", writeFile, os);
+
+ if (DILocation *inlineDL = DL.getInlinedAt()) {
+ os << ",\n";
+ unsigned inlineLine = inlineDL->getLine();
+ auto writeLine = [&inlineLine, &os]() { os << inlineLine; };
+ json_pair("inline_line", writeLine, os);
+
+ os << ",\n";
+
+ unsigned inlineCol = inlineDL->getColumn();
+ auto writeCol = [&inlineCol, &os]() { os << inlineCol; };
+ json_pair("inline_col", writeCol, os);
+
+ os << ",\n";
+
+ const MDNode *InlineScope = DL.getInlinedAtScope();
+ auto writeFile = [&InlineScope, &os]() {
+ const DIFile *File = cast<DIScope>(InlineScope)->getFile();
+ write_DIFile(File, os);
+ };
+ json_pair("inline_file", writeFile, os);
+ }
+
+ os << "\n},\n";
+}
+
+void write_dbg_val(unsigned Reg, const DIVariable *V, const DIExpression *Exp,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ raw_ostream &os) {
+ os << "{";
+
+ auto writeReg = [&MRI, &SIRI, &Reg, &os]() {
+ const unsigned SubReg = 0;
+ write_reg(Reg, SubReg, MRI, SIRI, os);
+ };
+ json_obj_pair("reg", writeReg, os);
+
+ os << ",\n";
+
+ if (V) {
+ auto writeName = [&V, &os]() { os << V->getName(); };
+ json_pair("debug_val_name", writeName, os);
+ os << ",\n";
+
+ auto writeFile = [&V, &os]() {
+ const DIFile *File = V->getFile();
+ write_DIFile(File, os);
+ };
+ json_pair("debug_val_file", writeFile, os);
+ os << ",\n";
+
+ auto writeLine = [&V, &os]() { os << V->getLine(); };
+ json_pair("debug_val_line", writeLine, os);
+ }
+
+ if (Exp->isValid() && Exp->getNumElements()) {
+ os << ",\n";
+ auto writeV = [&Exp, &os]() {
+ os << '[';
+ bool NeedSep = false;
+ for (auto Op : Exp->expr_ops()) {
+ if (NeedSep)
+ os << ", ";
+ else
+ NeedSep = true;
+ os << dwarf::OperationEncodingString(Op.getOp());
+ for (unsigned I = 0; I < Op.getNumArgs(); ++I)
+ os << ' ' << Op.getArg(I);
+ }
+ os << "] ";
+ };
+ json_pair("debug_exp", writeV, os);
+ }
+ os << "\n},\n";
+}
+
+void write_dbg_info(MachineFunction &MF, LiveIntervals *LIS,
+ const MachineRegisterInfo &MRI, const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI, const SlotIndexes *SlotIndexes,
+ const NamedMDNode *SourceMD, raw_ostream &os) {
+ os << ",\n";
+
+ auto writeFiles = [&SourceMD, &os]() {
+ for (const MDNode *FileNode : SourceMD->operands()) {
+ write_file(FileNode, os);
+ }
+ };
+
+ json_array("files", writeFiles, os);
+
+ os << ",\n";
+
+ auto writeLineMapping = [&MF, &SlotIndexes, &os]() {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugInstr()) {
+ continue;
+ }
+ const DebugLoc DL = MI.getDebugLoc();
+ if (!DL)
+ continue;
+ SlotIndex Slot = SlotIndexes->getInstructionIndex(MI);
+ write_line_mapping(Slot, DL, os);
+ }
+ }
+ };
+
+ json_array("line_mapping", writeLineMapping, os);
+
+ os << ",\n";
+
+ auto writeDebugVals = [&MF, &MRI, &SIRI, &os]() {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (!MI.isDebugValue())
+ continue;
+
+ MachineOperand &Reg = MI.getOperand(0);
+ if (!Reg.isReg())
+ continue;
+
+ if (Reg.getReg() == 0)
+ continue;
+
+ const DIVariable *V = MI.getDebugVariable();
+ const DIExpression *Exp = MI.getDebugExpression();
+ write_dbg_val(Reg.getReg(), V, Exp, MRI, SIRI, os);
+ }
+ }
+ };
+
+ json_array("debug_vals", writeDebugVals, os);
+}
+
+void write_function(MachineFunction &MF, LiveIntervals *LIS,
+ const MachineRegisterInfo &MRI, const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI, raw_ostream &os) {
+ const SlotIndexes *SlotIndexes = LIS->getSlotIndexes();
+
+ os << "{\n";
+ auto writeName = [&MF, &os]() { os << MF.getName(); };
+ json_pair("name", writeName, os);
+
+ os << ",\n";
+
+ auto writeBlocks = [&MF, &SlotIndexes, &LIS, &MRI, &SIRI, &SIII, &os]() {
+ for (MachineBasicBlock &MBB : MF) {
+ write_block(MBB, LIS, MRI, SIRI, SIII, os);
+ os << ",\n";
+ }
+ };
+
+ json_array("blocks", writeBlocks, os);
+
+ os << ",\n";
+
+ auto writeDefines = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() {
+ write_defines(MF, SlotIndexes, MRI, SIRI, os);
+ };
+
+ json_array("defines", writeDefines, os);
+
+ os << ",\n";
+
+ auto writeUses = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() {
+ write_uses(MF, SlotIndexes, MRI, SIRI, os);
+ };
+
+ json_array("uses", writeUses, os);
+
+ os << ",\n";
+
+ auto writeLiveness = [&MF, &LIS, &MRI, &SIRI, &os]() {
+ for (MachineBasicBlock &MBB : MF)
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ const SlotIndex &SI = LIS->getInstructionIndex(MI).getBaseIndex();
+ GCNRPTracker::LiveRegSet LISLR = llvm::getLiveRegs(SI, *LIS, MRI);
+ write_liveness(SI, LISLR, MRI, SIRI, os);
+ }
+ };
+
+ json_array("liveness", writeLiveness, os);
+
+ os << ",\n";
+
+ auto writeLiveIntervals = [&MRI, &SIRI, &LIS, &os]() {
+ for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) {
+ auto Reg = Register::index2VirtReg(i);
+ if (!LIS->hasInterval(Reg))
+ continue;
+ auto &LI = LIS->getInterval(Reg);
+ write_live_interval(LI, MRI, SIRI, os);
+ }
+ };
+
+ json_array("live_intervals", writeLiveIntervals, os);
+
+#if 0 // TODO: Do we need this?
+ // Check debug info.
+ const Function &F = MF.getFunction();
+ const Module *M = F.getParent();
+ const NamedMDNode *SourceMD =
+ M->getNamedMetadata(hlsl::DxilMDHelper::kDxilSourceContentsMDName);
+ if (SourceMD) {
+ write_dbg_info(MF, LIS, MRI, SIII, SIRI, SlotIndexes, SourceMD, os);
+ }
+#endif
+
+ os << "\n}";
+}
+
+void write_pressure(MachineFunction &MF, LiveIntervals *LIS,
+ const char *Filename) {
+ int FD = -1;
+ SmallString<128> TmpFilename(Filename);
+ std::error_code EC = sys::fs::createUniqueFile(TmpFilename, FD, TmpFilename);
+ if (EC) {
+ errs() << "Error: " << EC.message() << "\n";
+ return;
+ }
+
+ raw_fd_ostream O(FD, /*shouldClose=*/true);
+
+ const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+ const auto *SIII = ST->getInstrInfo();
+ const auto *SIRI = ST->getRegisterInfo();
+ auto &MRI = MF.getRegInfo();
+ write_function(MF, LIS, MRI, SIII, SIRI, O);
+ O.flush();
+ O.close();
+}
+
+void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &os) {
+ const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+ const auto *SIII = ST->getInstrInfo();
+ const auto *SIRI = ST->getRegisterInfo();
+ auto &MRI = MF.getRegInfo();
+ write_function(MF, LIS, MRI, SIII, SIRI, os);
+ os.flush();
+}
+
+} // namespace pressure
+}// namespace llvm
+
+namespace {
+class ContributionList {
+public:
+  ContributionList(MachineFunction &MF) : MF(MF) {}
+ void build();
+ bool propagateContribution();
+ MachineFunction &MF;
+ DenseMap<MachineInstr *, unsigned> MIIndexMap;
+  // Set of instructions that contribute to building the key MachineInstr.
+  DenseMap<MachineInstr *, DenseSet<MachineInstr *>> MIContributorMap;
+  // Set of instructions that the key MachineInstr contributes to.
+  DenseMap<MachineInstr *, DenseSet<MachineInstr *>> MIContributedToMap;
+ void writeInst(MachineInstr &MI, const SIInstrInfo *SIII, raw_ostream &os);
+ void writeBlock(MachineBasicBlock &MBB, const SIInstrInfo *SIII,
+ raw_ostream &os);
+ void write(raw_ostream &os);
+};
+
+void buildMIContribution(MachineInstr &MI,
+ DenseSet<MachineInstr *> &ContributorSet,
+ DenseSet<MachineInstr *> &ContributedSet,
+ const SIRegisterInfo &SIRI, MachineRegisterInfo &MRI) {
+ for (MachineOperand &UseMO : MI.uses()) {
+ if (!UseMO.isReg())
+ continue;
+ Register Reg = UseMO.getReg();
+ if (Reg.isPhysical())
+ continue;
+ if (UseMO.isImplicit()) {
+ // if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
+ // Reg == AMDGPU::SCC)
+ continue;
+ }
+ for (MachineInstr &DefMI : MRI.def_instructions(Reg)) {
+ ContributorSet.insert(&DefMI);
+ }
+ }
+
+ for (MachineOperand &DstMO : MI.defs()) {
+ if (!DstMO.isReg())
+ continue;
+ if (DstMO.isImplicit())
+ continue;
+ Register Reg = DstMO.getReg();
+ if (Reg.isPhysical())
+ continue;
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+ ContributedSet.insert(&UseMI);
+ }
+ }
+}
+
+bool ContributionList::propagateContribution() {
+ bool bUpdated = false;
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ for (auto *MBB : RPOT) {
+ for (auto &MI : *MBB) {
+ auto &contributors = MIContributorMap[&MI];
+ unsigned size = contributors.size();
+ DenseSet<MachineInstr *> parentContributors;
+ for (auto *CMI : contributors) {
+ auto &pContributors = MIContributorMap[CMI];
+ parentContributors.insert(pContributors.begin(), pContributors.end());
+ }
+ contributors.insert(parentContributors.begin(), parentContributors.end());
+ bUpdated |= size < contributors.size();
+ }
+ }
+ return bUpdated;
+}
+
+void ContributionList::build() {
+ // Build contribution.
+ auto &MRI = MF.getRegInfo();
+ const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+ const auto *SIRI = ST->getRegisterInfo();
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ auto &contributors = MIContributorMap[&MI];
+ auto &contributed = MIContributedToMap[&MI];
+ buildMIContribution(MI, contributors, contributed, *SIRI, MRI);
+ }
+ }
+ // propagate contribution.
+ bool bUpdated = true;
+ while (bUpdated) {
+ bUpdated = propagateContribution();
+ }
+}
+
+void ContributionList::writeInst(MachineInstr &MI, const SIInstrInfo *SIII,
+ raw_ostream &os) {
+ os << "\n{\n";
+ unsigned ID = MIIndexMap[&MI];
+ auto writeSlot = [&ID, &os]() { os << ID; };
+
+ json_pair("ID", writeSlot, os);
+
+ os << ",";
+
+ auto writeAsm = [&MI, &SIII, &os]() {
+ MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false,
+ /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII);
+ };
+ json_pair("asm", writeAsm, os);
+
+ os << ",\n";
+
+ auto &contributors = MIContributorMap[&MI];
+ auto writeContributor = [&contributors, this, &os]() {
+ for (auto *MI : contributors) {
+ unsigned ID = MIIndexMap[MI];
+ os << ID << ",";
+ }
+ };
+
+ json_array("contributors", writeContributor, os);
+ os << ",\n";
+
+ auto &contributeds = MIContributedToMap[&MI];
+ auto writeContributed = [&contributeds, this, &os]() {
+ for (auto *MI : contributeds) {
+ unsigned ID = MIIndexMap[MI];
+ os << ID << ",";
+ }
+ };
+
+ json_array("contributed", writeContributed, os);
+ os << "\n}\n";
+}
+
+void ContributionList::writeBlock(MachineBasicBlock &MBB,
+ const SIInstrInfo *SIII, raw_ostream &os) {
+ os << "{\n";
+ auto writeName = [&MBB, &os]() { os << MBB.getName(); };
+ json_pair("name", writeName, os);
+
+ os << ",";
+
+ auto writeIndex = [&MBB, &os]() { os << MBB.getNumber(); };
+ json_pair("id", writeIndex, os);
+
+ os << ",\n";
+
+ auto writeInsts = [this, &MBB, &SIII, &os]() {
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ writeInst(MI, SIII, os);
+ os << ",\n";
+ }
+ };
+
+ json_array("instructions", writeInsts, os);
+
+ os << ",\n";
+
+ auto writePreds = [&MBB, &os]() {
+ for (MachineBasicBlock *Pred : MBB.predecessors()) {
+ os << Pred->getNumber() << ",";
+ }
+ };
+
+ json_array("preds", writePreds, os);
+
+ os << ",";
+
+ auto writeSuccs = [&MBB, &os]() {
+ for (MachineBasicBlock *Succ : MBB.successors()) {
+ os << Succ->getNumber() << ",";
+ }
+ };
+
+ json_array("succs", writeSuccs, os);
+
+ os << "}";
+}
+
+void ContributionList::write(raw_ostream &os) {
+ unsigned ID = 0;
+ // Build ID for write.
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ for (auto *MBB : RPOT) {
+ for (auto &MI : *MBB) {
+ MIIndexMap[&MI] = ID++;
+ }
+ }
+
+ const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+ const auto *SIII = ST->getInstrInfo();
+
+ os << "{\n";
+ auto writeName = [this, &os]() { os << MF.getName(); };
+ json_pair("name", writeName, os);
+
+ os << ",\n";
+
+ auto writeBlocks = [this, &SIII, &RPOT, &os]() {
+ for (auto *MBB : RPOT) {
+ writeBlock(*MBB, SIII, os);
+ os << ",\n";
+ }
+ };
+
+ json_array("blocks", writeBlocks, os);
+
+ os << "\n}";
+}
+} // namespace
+
+namespace llvm {
+
+void write_contribution_list(llvm::MachineFunction &MF, const char *Filename) {
+ int FD = -1;
+ SmallString<128> TmpFilename(Filename);
+ std::error_code EC = sys::fs::createUniqueFile(TmpFilename, FD, TmpFilename);
+ if (EC) {
+ errs() << "Error: " << EC.message() << "\n";
+ return;
+ }
+
+ raw_fd_ostream O(FD, /*shouldClose=*/true);
+ ContributionList CL(MF);
+ CL.build();
+
+ CL.write(O);
+
+ O.flush();
+ O.close();
+}
+} // namespace llvm
+
+static bool IsPhysReg(const MachineOperand &Op) {
+  return Op.isReg() && Op.getReg().isPhysical();
+}
+
+// Sometimes a split block uses physical registers defined in the original
+// block; they have to be added to the live-ins or the IR is malformed.
+void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB,
+                                       const MachineRegisterInfo *MRI) {
+  // Initialize with the current set of live-ins. For new blocks this will be
+  // empty.
+  SmallDenseSet<unsigned, 8> DefSet;
+  for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins())
+    DefSet.insert(P.PhysReg);
+
+  for (auto &MI : *NewBB) {
+    // Add all undefined physical registers to the live-in set.
+    for (MachineOperand &Use : MI.operands()) {
+      // Only process physreg uses.
+      if (!IsPhysReg(Use) || !Use.isUse())
+        continue;
+
+      // Reserved regs do not need to be tracked through live-in sets.
+      unsigned Reg = Use.getReg();
+      if (Use.isImplicit() && MRI && MRI->isReserved(Reg))
+        continue;
+
+      if (!DefSet.count(Reg))
+        NewBB->addLiveIn(Reg);
+    }
+
+    // Add all physical register defs (explicit+implicit) to the def register
+    // set.
+    for (MachineOperand &Def : MI.operands()) {
+      // Only process physreg defs.
+      if (!IsPhysReg(Def) || !Def.isDef())
+        continue;
+      DefSet.insert(Def.getReg());
+    }
+  }
+}
+
+void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB,
+ SmallDenseSet<unsigned, 8> &LiveOutSet,
+ const MachineRegisterInfo *MRI) {
+ for (auto rit = NewBB->rbegin(); rit != NewBB->rend(); rit++) {
+ auto &MI = *rit;
+    // Add all physical register defs (explicit+implicit) to the def register
+    // set.
+ for (MachineOperand &Def : MI.operands()) {
+ // Only process physreg defs.
+ if (!IsPhysReg(Def) || !Def.isDef())
+ continue;
+ LiveOutSet.erase(Def.getReg());
+ }
+ // Add all undefined physical registers to the live in set.
+ for (MachineOperand &Use : MI.operands()) {
+ // Only process physreg uses.
+ if (!IsPhysReg(Use) || !Use.isUse())
+ continue;
+
+ // Reserved regs do not need to be tracked through live-in sets.
+ unsigned Reg = Use.getReg();
+ if (Use.isImplicit() && MRI && MRI->isReserved(Reg))
+ continue;
+
+ if (!LiveOutSet.count(Reg))
+ LiveOutSet.insert(Reg);
+ }
+ }
+ for (unsigned Reg : LiveOutSet) {
+ NewBB->addLiveIn(Reg);
+ }
+}
+
+MachineReg llvm::CreateVirtualRegForOperand(MachineOpcode Opcode,
+                                            unsigned OpNum,
+                                            MachineFunction &MF) {
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
+  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+  const TargetInstrInfo *TII = ST.getInstrInfo();
+  const MCInstrDesc &Desc = TII->get(Opcode);
+  const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF);
+  if (!RC)
+    llvm::report_fatal_error(
+        "Unable to create virtual reg for instruction operand");
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  return MRI.createVirtualRegister(RC);
+}
+
+MachineReg llvm::CreateVirtualDstReg(MachineOpcode Opcode,
+                                     MachineFunction &MF) {
+  return llvm::CreateVirtualRegForOperand(Opcode, 0, MF);
+}
+
+// Return true if the MI is a copy of exec.
+// If true then sets pDst to the destination register.
+bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst)
+{
+ enum {DST=0, SRC=1};
+ bool FoundCopy = false;
+ if (MI.getOpcode() == AMDGPU::COPY
+ || MI.getOpcode() == AMDGPU::S_MOV_B32
+ || MI.getOpcode() == AMDGPU::S_MOV_B64)
+ {
+ const MachineOperand &Src = MI.getOperand(SRC);
+ if (Src.isReg() && Src.getReg() == Exec)
+ {
+ FoundCopy = true;
+ }
+ }
+#if 0 // TODO: Delete this.
+ else if (MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO ||
+ MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO_32)
+ {
+ FoundCopy = true;
+ }
+#endif
+
+ if (FoundCopy)
+ {
+ *pDst = MI.getOperand(DST).getReg();
+ }
+
+ return FoundCopy;
+}
+
+llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF)
+{
+ llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, AMDGPU::NoSubRegister};
+ if (MachineInstr* MI = GetWqmEntryActiveMaskInst(MF))
+ {
+ LiveLaneMask.Reg = MI->getOperand(0).getReg();
+ LiveLaneMask.SubReg = MI->getOperand(0).getSubReg();
+ }
+
+ return LiveLaneMask;
+}
+
+MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF)
+{
+#if 0 // TODO: Get rid of this
+ // Look forward in the entry block for the SET_LIVE_LANE_MASK instruction.
+ // This instruction is added by the SIWholeQuadMode pass.
+ MachineBasicBlock &MBB = MF.front();
+ for (MachineInstr &MI : MBB)
+ {
+ if (MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK ||
+ MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK_32)
+ {
+ return &MI;
+ }
+ }
+#endif
+
+ return nullptr;
+}
+
+bool llvm::IsFetchShaderCall(const MachineInstr *MI)
+{
+#if 0 // TODO: Get rid of this.
+ return
+ MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER ||
+ MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::FetchShaderCall);
+#else
+ return false;
+#endif
+}
+
+bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) {
+ const TargetRegisterInfo* TRI = MBB->getParent()->getRegInfo().getTargetRegisterInfo();
+ for (auto it = MI; it != MBB->end(); ++it) {
+ const MachineInstr &CurMI = *it;
+ // Hit use of scc, it is live.
+ if (CurMI.readsRegister(AMDGPU::SCC, TRI))
+ return true;
+ // Hit def of scc first, not live.
+ if (CurMI.definesRegister(AMDGPU::SCC, TRI))
+ return false;
+ }
+ // Reach the end of MBB, check live-ins of MBB successors.
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ->isLiveIn(AMDGPU::SCC))
+ return true;
+ }
+ return false;
+}
+
+//
+// This function is useful for when we need to insert a new
+// instruction that defines scc in a block and we need to find
+// a location that will not smash the existing value.
+//
+// Starting at `BeforeInst` it will look backwards to try to find
+// a place in the block where scc is dead so we can insert our new
+// def there. If no location can be found it will save and restore
+// scc around BeforeInst. This way BeforeInst can safely be used
+// as the new insert location.
+//
+MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef(
+ MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MI,
+ const TargetRegisterInfo* TRI,
+ const SIInstrInfo* TII,
+ MachineRegisterInfo* MRI,
+ SccDefInsertPointConstraintFlags Constraints
+)
+{
+  // If SCC is dead at MI then we can use MI as the insert point.
+ if (!llvm::IsSccLiveAt(MBB, MI))
+ {
+ return MI;
+ }
+
+ const bool CheckForExecWrite =
+ Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
+
+ // Get the starting reverse iterator taking care to handle the MBB->end() case.
+ MachineBasicBlock::reverse_iterator Start;
+ if (MI == MBB->end())
+ {
+ Start = MBB->rbegin();
+ }
+ else
+ {
+ Start = MI.getReverse();
+ }
+
+ // Otherwise, walk backwards through the block looking for a location where
+ // SCC is dead.
+ for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); It != End; ++It)
+ {
+ // If the instruction modifies exec then we cannot use it as
+ // an insertion point (if that is a constraint from the caller).
+ // The check for EXEC works for both wave64 and wave32 because
+ // it will also catch writes to the subregisters (e.g. exec_lo).
+ if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI))
+ {
+ break;
+ }
+
+ if (It->modifiesRegister(AMDGPU::SCC, TRI)
+ && !It->readsRegister(AMDGPU::SCC, TRI))
+ {
+ return It->getIterator();
+ }
+ }
+
+ // If no safe location can be found in the block we can save and restore
+ // SCC around MI. There is no way to directly read or write SCC so we use
+ // s_cselect to read the current value of SCC and s_cmp to write the saved
+ // value back to SCC.
+ //
+  // The generated code will look like this:
+ //
+ // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC
+ // <----- Newly created safe insert point.
+ // MI
+ // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC
+ //
+ unsigned int TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(*MBB, std::next(MI->getIterator()), DL, TII->get(AMDGPU::S_CMP_LG_U32))
+ .addReg(TmpScc, RegState::Kill)
+ .addImm(0);
+
+ return MI;
+}
+
+
+namespace {
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes,
+ SmallDenseSet<MachineBasicBlock *, 2> &touchedMBBSet) {
+ MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start);
+ MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end);
+  // Treat a segment not bounded by instructions as not local.
+  if (!startMI || !endMI)
+    return false;
+  // The segment is local when the parent MBBs are the same.
+  bool bSameMBB = startMI->getParent() == endMI->getParent();
+ if (!bSameMBB)
+ return false;
+ // Collect touched MBB.
+ MachineBasicBlock *MBB = startMI->getParent();
+ touchedMBBSet.insert(MBB);
+ return true;
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes,
+ SmallDenseSet<MachineBasicBlock *, 2> &touchedMBBSet) {
+ for (const LiveRange::Segment &Seg : Range->segments) {
+ if (!isLocalSegment(&Seg, Indexes, touchedMBBSet))
+ return false;
+ }
+ return true;
+}
+
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) {
+ MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start);
+ MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end);
+  // Treat a segment not bounded by instructions as not local.
+  if (!startMI || !endMI)
+    return false;
+  // The segment is local when the parent MBBs are the same.
+  return startMI->getParent() == endMI->getParent();
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) {
+ for (const LiveRange::Segment &Seg : Range->segments) {
+ if (!isLocalSegment(&Seg, Indexes))
+ return false;
+ }
+ return true;
+}
+
+} // namespace
+
+// In a case like float4 v, where v.x is used and defined in one block and v.y
+// is used and defined in another block, one live interval can touch more than
+// one MBB. touchedMBBSet is used for scheduling, where a local live interval
+// could cross multiple regions and the live regs must be calculated for each
+// region inside the touched MBBs.
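+//
+// A hypothetical MIR sketch (for illustration only, not taken from a test):
+//
+//   bb.0:
+//     undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+//     ... %1.sub0 used only in bb.0 ...
+//   bb.1:
+//     %1.sub1:vreg_64 = V_MOV_B32_e32 1, implicit $exec
+//     ... %1.sub1 used only in bb.1 ...
+//
+// Each segment of %1 starts and ends inside a single block, so the interval is
+// "local", yet it touches both bb.0 and bb.1; touchedMBBSet records that.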
+bool llvm::isLocalLiveInterval(
+ const LiveInterval &LI, SlotIndexes *Indexes,
+ SmallDenseSet<MachineBasicBlock *, 2> &touchedMBBSet) {
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges()) {
+ if (!isLocalLiveRange(&S, Indexes, touchedMBBSet))
+ return false;
+ }
+ }
+ return isLocalLiveRange(&LI, Indexes, touchedMBBSet);
+}
+
+
+bool llvm::isLocalLiveInterval(
+ const LiveInterval &LI, SlotIndexes *Indexes) {
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges()) {
+ if (!isLocalLiveRange(&S, Indexes))
+ return false;
+ }
+ }
+ return isLocalLiveRange(&LI, Indexes);
+}
+
+// This is used to speed up reg pressure calculation.
+// If an instruction is moved, the cached live set will be out of date;
+// before any instruction is moved, the values are correct.
+void llvm::buildEndLiveMap(
+ llvm::LiveIntervals *LIS, llvm::MachineFunction &MF,
+ const llvm::MachineRegisterInfo &MRI,
+ llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet>
+ &MBBLiveMap, bool After) {
+  // When there is only one block, the end live regs must be empty.
+ if (MF.size() == 1)
+ return;
+ auto *SlotIndexes = LIS->getSlotIndexes();
+ DenseMap<MachineBasicBlock *, SlotIndex> MBBOutputSlotMap;
+ for (MachineBasicBlock &MBB : MF) {
+ auto BBEnd = MBB.rbegin();
+
+ // R.End doesn't point to the boundary instruction.
+ // Skip Debug instr.
+ if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) {
+ auto SI = SlotIndexes->getInstructionIndex(*BBEnd);
+ MBBOutputSlotMap[&MBB] = After ? SI.getDeadSlot() : SI.getBaseIndex();
+ }
+ }
+
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ auto Reg = Register::index2VirtReg(I);
+ if (!LIS->hasInterval(Reg))
+ continue;
+
+    const auto &LI = LIS->getInterval(Reg);
+
+    // Skip local live intervals to make live input/output computation faster.
+    if (llvm::isLocalLiveInterval(LI, SlotIndexes))
+      continue;
+
+ for (auto outputIt : MBBOutputSlotMap) {
+ MachineBasicBlock *MBB = outputIt.first;
+ auto SI = outputIt.second;
+
+ auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+ if (LiveMask.any())
+ MBBLiveMap[MBB][Reg] = LiveMask;
+ }
+ }
+}
+
+unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) {
+ auto &MRI = MF.getRegInfo();
+ for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ return SIRI->getHWRegIndex(Reg) - SIRI->getHWRegIndex(AMDGPU::VGPR0) + 1;
+ }
+ }
+ return 0;
+}
+
+unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned MaxSGPR = 0;
+ for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+      // Skip the reserved scratch rsrc reg, which is a big register that
+      // doesn't really contribute to this stat.
+ if (ScratchRSrcReg != 0) {
+ if (SIRI->isSubRegister(ScratchRSrcReg, Reg))
+ continue;
+ }
+ MaxSGPR = SIRI->getHWRegIndex(Reg) - SIRI->getHWRegIndex(AMDGPU::SGPR0);
+ break;
+ }
+ }
+ return 1 + llvm::RegForVCC + MaxSGPR;
+}
+
+void llvm::dumpLiveSet(const LiveSet &LiveSet,
+ const SIRegisterInfo *SIRI) {
+
+ dbgs() << "\n live set: \n";
+ for (auto it : LiveSet) {
+ int Reg = it.first;
+ dbgs() << printReg(Reg, SIRI);
+ if (it.second.any()) {
+ dbgs() << " mask:" << it.second.getAsInteger();
+ }
+ dbgs() << "\n";
+ }
+}
+
+// Test if all fast math flags of this Machine Instr are set. This allows
+// all non-strict floating-point transforms.
+bool llvm::isFastMathInst(llvm::MachineInstr &MI) {
+ // Follow the checks in isFast() in SelectionDAGNodes.h
+ return MI.getFlag(llvm::MachineInstr::MIFlag::FmNsz) &&
+ MI.getFlag(llvm::MachineInstr::MIFlag::FmArcp) &&
+ MI.getFlag(llvm::MachineInstr::MIFlag::FmNoNans) &&
+ MI.getFlag(llvm::MachineInstr::MIFlag::FmNoInfs) &&
+ MI.getFlag(llvm::MachineInstr::MIFlag::FmContract) &&
+ MI.getFlag(llvm::MachineInstr::MIFlag::FmAfn) &&
+ MI.getFlag(llvm::MachineInstr::MIFlag::FmReassoc);
+}
+#if 0
+bool llvm::IsLdsSpillSupportedForHwStage(xmd::HwStage Stage)
+{
+ switch (Stage)
+ {
+ case xmd::HwStage::PS:
+ case xmd::HwStage::CS:
+ return true;
+ default:
+ return false;
+ }
+}
+#endif
+
+MachineBasicBlock::succ_iterator
+llvm::FindSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ) {
+  for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(),
+                                        End = MBB->succ_end();
+       It != End; ++It) {
+    if (*It == Succ)
+      return It;
+  }
+
+  return MBB->succ_end();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
new file mode 100644
index 000000000000000..16b55c5c945835f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -0,0 +1,217 @@
+#pragma once
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+
+class MachineFunction;
+class LiveIntervals;
+class LiveInterval;
+class MachineRegisterInfo;
+class SIRegisterInfo;
+class SIInstrInfo;
+class MachineInstr;
+class MachinePostDominatorTree;
+class MachineLoopInfo;
+class MachineDominatorTree;
+class raw_ostream;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+
+typedef unsigned MachineReg;
+typedef unsigned MachineOpcode;
+
+constexpr unsigned RegForVCC = 2;
+constexpr unsigned VGPR_LIMIT = 256;
+// Post-RA remat only tries to help cases where pressure is OK before RA but
+// the RA result is higher. The diff should not be too big, so just use 4 as
+// the threshold here.
+constexpr unsigned PostRARematThreshHold = 4;
+
+using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
+
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI);
+void CollectLiveSetPressure(
+ const LiveSet &liveSet,
+ const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI,
+ unsigned &VPressure, unsigned &SPressure);
+
+bool isExecUpdateForControlFlow(llvm::MachineInstr &MI);
+
+bool IsSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI);
+
+llvm::LaneBitmask getRegMask(const llvm::MachineOperand &MO,
+ const llvm::MachineRegisterInfo &MRI);
+void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet);
+void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet);
+void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet);
+llvm::MachineBasicBlock *split(llvm::MachineInstr *I);
+
+// For an inst like S_BUFFER_LOAD_DWORDX16, change it to S_BUFFER_LOAD_DWORDX4
+// if only 4 lanes are used.
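+//
+// A hypothetical sketch of the shrink (illustrative MIR, not from a test):
+//   %0:sgpr_512 = S_BUFFER_LOAD_DWORDX16_IMM %desc:sgpr_128, 0, 0
+//   %1:sgpr_32  = COPY %0.sub1
+// With only sub0..sub3 of %0 used, the load can become
+//   %0:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %desc:sgpr_128, 0, 0
+//   %1:sgpr_32  = COPY %0.sub1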
+bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *TRI,
+ const llvm::SIInstrInfo *TII,
+ llvm::SlotIndexes *SlotIndexes);
+
+bool reach_block(llvm::MachineBasicBlock *FromBB, llvm::MachineDominatorTree *DT,
+ llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
+ llvm::MachineBasicBlock *ToBB);
+
+
+void viewCFGWithPhi(llvm::MachineFunction &MF);
+void write_contribution_list(llvm::MachineFunction &MF, const char *Filename);
+
+llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, const llvm::SIInstrInfo *TII);
+
+bool GetNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd,
+ llvm::MachineBasicBlock &MBB);
+
+void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, const llvm::MachineRegisterInfo *MRI);
+
+void BuildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB,
+ llvm::SmallDenseSet<unsigned, 8> &LiveOutSet,
+ const llvm::MachineRegisterInfo *MRI);
+
+MachineReg CreateVirtualRegForOperand(MachineOpcode Opcode, unsigned OpNum,
+                                      llvm::MachineFunction &MF);
+
+MachineReg CreateVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF);
+
+bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, MachineReg *pDst);
+struct MachineRegWithSubReg {
+ MachineReg Reg = AMDGPU::NoRegister;
+ unsigned SubReg = AMDGPU::NoSubRegister;
+};
+MachineRegWithSubReg GetWqmEntryActiveMask(llvm::MachineFunction &MF);
+llvm::MachineInstr *GetWqmEntryActiveMaskInst(llvm::MachineFunction &MF);
+
+// Return true if this machine instruction represents a call to the fetch
+// shader. We currently have two mechanisms for calling the fetch shader:
+// 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction
+// 2. A CALL instruction with the `FetchShaderCall` flag set to true.
+bool IsFetchShaderCall(const llvm::MachineInstr* MI);
+
+bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI);
+
+
+// An enum used to pass additional constraints to
+// `FindOrCreateInsertionPointForSccDef()`. This will further
+// constrain the location where the scc def can be inserted.
+enum SccDefInsertPointConstraintFlags
+{
+ None = 0, // No additional constraints.
+ NoExecWrite = 1, // Should be no modification of exec between BeforeInst and insert point.
+};
+
+// Look for a safe place to insert an instruction that defines scc.
+//
+//
+// This function is useful for when we need to insert a new
+// instruction that defines scc in a block and we need to find
+// a location that will not smash the existing value.
+//
+// Starting at `BeforeInst` it will look backwards to try to find
+// a place in the block where scc is dead so we can insert our new
+// def there. If no location can be found it will save and restore
+// scc around BeforeInst. This way BeforeInst can safely be used
+// as the new insert location.
+//
+llvm::MachineBasicBlock::iterator FindOrCreateInsertionPointForSccDef(
+ llvm::MachineBasicBlock* MBB,
+ llvm::MachineBasicBlock::iterator BeforeInst,
+ const llvm::TargetRegisterInfo* TRI,
+ const llvm::SIInstrInfo* TII,
+ llvm::MachineRegisterInfo* MRI,
+ SccDefInsertPointConstraintFlags Constraints = SccDefInsertPointConstraintFlags::None
+);
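+//
+// A hypothetical usage sketch (the names below are illustrative, not from this
+// patch): inserting an S_AND_B32, which clobbers SCC, without smashing a live
+// SCC value:
+//
+//   MachineBasicBlock::iterator InsertPt =
+//       llvm::FindOrCreateInsertionPointForSccDef(MBB, MI, TRI, TII, &MRI);
+//   BuildMI(*MBB, InsertPt, MI->getDebugLoc(), TII->get(AMDGPU::S_AND_B32),
+//           DstReg)
+//       .addReg(SrcReg)
+//       .addImm(0xFFFF);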
+
+// Check whether LI lives across basic blocks; if it is local, record all
+// touched basic blocks.
+bool isLocalLiveInterval(
+ const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes,
+ llvm::SmallDenseSet<llvm::MachineBasicBlock *, 2> &touchedMBBSet);
+bool isLocalLiveInterval(
+ const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes);
+
+// Build the live reg set at the end of each MBB.
+void buildEndLiveMap(
+ llvm::LiveIntervals *LIS, llvm::MachineFunction &MF,
+ const llvm::MachineRegisterInfo &MRI,
+ llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet>
+ &MBBLiveMap, bool After);
+
+void dumpLiveSet(const LiveSet &LiveSet,
+ const llvm::SIRegisterInfo *SIRI);
+
+unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI);
+unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI);
+
+bool isFastMathInst(llvm::MachineInstr &MI);
+
+namespace pressure {
+void print_reg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI,
+ llvm::raw_ostream &os);
+void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS,
+ const char *Filename);
+void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS,
+ llvm::raw_ostream &os);
+} // namespace pressure
+// bool IsLdsSpillSupportedForHwStage(xmd::HwStage Stage);
+
+// Look for the successor `Succ` of the given `MBB`.
+// Returns MBB->succ_end() if `Succ` is not a successor of MBB.
+llvm::MachineBasicBlock::succ_iterator FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ);
+
+// The enum and helper function for v_perm selection mask.
+//
+// The input byte layout of v_perm is as below:
+//
+// BYTE in[8]
+// in[0] = $src1_BYTE0;
+// in[1] = $src1_BYTE1;
+// in[2] = $src1_BYTE2;
+// in[3] = $src1_BYTE3;
+// in[4] = $src0_BYTE0;
+// in[5] = $src0_BYTE1;
+// in[6] = $src0_BYTE2;
+// in[7] = $src0_BYTE3;
+//
+enum class V_PERM_IN_BYTE_POS {
+ src1_BYTE0 = 0,
+ src1_BYTE1,
+ src1_BYTE2,
+ src1_BYTE3,
+ src0_BYTE0,
+ src0_BYTE1,
+ src0_BYTE2,
+ src0_BYTE3
+};
+
+// The 4 arguments specify which input byte goes to each output byte:
+// out[0] = Sel_0;
+// out[1] = Sel_1;
+// out[2] = Sel_2;
+// out[3] = Sel_3;
+//
+constexpr int buildVPermSelectMask(V_PERM_IN_BYTE_POS Sel_0,
+ V_PERM_IN_BYTE_POS Sel_1,
+ V_PERM_IN_BYTE_POS Sel_2,
+ V_PERM_IN_BYTE_POS Sel_3) {
+ return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) |
+ ((int)Sel_1 << 8) | (int)Sel_0);
+}
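+
+// For example (an illustrative use, not from this patch), selecting the low
+// two bytes of src0 followed by the low two bytes of src1:
+//
+//   constexpr int Mask = buildVPermSelectMask(V_PERM_IN_BYTE_POS::src0_BYTE0,
+//                                             V_PERM_IN_BYTE_POS::src0_BYTE1,
+//                                             V_PERM_IN_BYTE_POS::src1_BYTE0,
+//                                             V_PERM_IN_BYTE_POS::src1_BYTE1);
+//   // Mask == 0x01000504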
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
new file mode 100644
index 000000000000000..ceb22b5ff9243dc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
@@ -0,0 +1,2767 @@
+//===- AMDGPUMirDivergenceAnalysis.cpp - MIR Divergence Analysis ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is based on Analysis/DivergenceAnalysis.cpp,
+// The most important difference is
+// introduction of the idea of "Bit-Divergence".
+//
+// The way booleans are represented on AMD GPUs is a 64-bit uint in a pair of
+// scalar registers, where each bit represents a boolean value for one lane. If
+// all active lanes have the same bool value (all 1's or all 0's), then we can
+// generate a scalar branch, otherwise we must use exec mask to selectively
+// execute lanes based on the boolean mask. When all values in a boolean mask
+// are the same for all active lanes, we call that mask "bit-uniform",
+// otherwise we call it "bit-divergent". This differs from the normal concept
+// of "uniform" and "divergent", which represents whether the value may be
+// different across the 64 lanes. A "bit-divergent" value is still "uniform" in
+// the sense that it is the same 64-bit value from the perspective of all the
+// lanes, but when used as branch condition, will cause the branch to be
+// divergent, which will cause the uses of any values outside of the control
+// flow region to be divergent.
+//
+// The original DA marks everything including bools as divergent or uniform
+// based on the propagation of divergent sources. However, booleans in AMDGPU
+// are in fact never "divergent". Comparison operations that receive divergent
+// operands instead produce "bit-divergent" or "bit-uniform" 64-bit booleans.
+// Between the definition of any boolean mask and its use (particularly in
+// branches, cndmasks, or anything that specifically consumes booleans), there
+// can be any arbitrary number and types of operations performed on it,
+// including combining it with other boolean masks via bit operations.
+//
+// The XDA algorithm is a modified version of the original DA algorithm to
+// simultaneously propagate regular divergence and bit-divergence.
+//
+// First off, XDA identifies all sources of divergence as well as
+// bit-divergence and adds them to the worklist. Then, just like with LLVM DA,
+// it pops values off of the worklist to propagate (bit-)divergence to all its
+// users, unless the user is always (bit-)uniform when given (bit-)divergent
+// operand. It's possible for a value to be marked as both divergent and
+// bit-divergent, in which case the regular divergence will trump
+// bit-divergence.
+//
+// The important difference in this propagation step is that there are special
+// instructions that when given bit-divergent operands, produce divergent
+// values and vice versa.
+//
+// An example is comparison:
+//
+// v0 = interp ... ; divergent
+// v1 = interp ... ; divergent
+// s[0:1] = v_cmp v0, v1 ; bit-divergent
+//
+// v0 and v1 are both divergent, but when propagating them, the v_cmp (and its
+// result) is bit-divergent value instead of divergent.
+//
+//
+// An example of the reverse:
+//
+// v0 = ... ; uniform
+// s[0:1] = v_cmp v0, v1 ; bit-divergent
+// ...
+// branch s[0:1], label ; divergent!
+// ...
+// v1 = ... ; uniform
+// ...
+//
+// label:
+// v3 = phi v0, v1 ; divergent! because of divergent branch.
+//
+// The boolean value is bit-divergent. When passed to the branch as an operand,
+// the branch becomes divergent, whose sync dependency will be computed as
+// normal to mark the appropriate values divergent (see description in normal
+// DA on how this works).
+//
+// Another difference is that in MIR some branches are changed into exec
+// updates, so propagating control-flow divergence only on branch instructions
+// will not cover exec-based control flow.
+// For a case like
+// %163:sreg_64_xexec = S_MOV_B64 $exec
+//bb.1:
+//; predecessors: %bb.1, %bb.0
+// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
+// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
+// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec
+// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec
+// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+//...
+// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc
+// S_CBRANCH_EXECNZ %bb.1, implicit $exec
+// The "..." code after SAVEEXEC will be divergent if %168 is divergent, and
+// the PHI should be divergent when %40 is defined inside the "...".
+// To propagate divergence from %168 to the PHI, the propagation needs to start
+// from SAVEEXEC, which realizes the control flow by updating exec.
+//
+//
+// Original:
+// This file implements a general divergence analysis for loop vectorization
+// and GPU programs. It determines which branches and values in a loop or GPU
+// program are divergent. It can help branch optimizations such as jump
+// threading and loop unswitching to make better decisions.
+//
+// GPU programs typically use the SIMD execution model, where multiple threads
+// in the same execution group have to execute in lock-step. Therefore, if the
+// code contains divergent branches (i.e., threads in a group do not agree on
+// which path of the branch to take), the group of threads has to execute all
+// the paths from that branch with different subsets of threads enabled until
+// they re-converge.
+//
+// Due to this execution model, some optimizations such as jump
+// threading and loop unswitching can interfere with thread re-convergence.
+// Therefore, an analysis that computes which branches in a GPU program are
+// divergent can help the compiler to selectively run these optimizations.
+//
+// This implementation is derived from the Vectorization Analysis of the
+// Region Vectorizer (RV). That implementation in turn is based on the approach
+// described in
+//
+// Improving Performance of OpenCL on CPUs
+// Ralf Karrenberg and Sebastian Hack
+// CC '12
+//
+// This DivergenceAnalysis implementation is generic in the sense that it does
+// not itself identify original sources of divergence.
+// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and
+// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence
+// (e.g., special variables that hold the thread ID or the iteration variable).
+//
+// The generic implementation propagates divergence to variables that are data
+// or sync dependent on a source of divergence.
+//
+// While data dependency is a well-known concept, the notion of sync dependency
+// is worth more explanation. Sync dependence characterizes the control flow
+// aspect of the propagation of branch divergence. For example,
+//
+// %cond = icmp slt i32 %tid, 10
+// br i1 %cond, label %then, label %else
+// then:
+// br label %merge
+// else:
+// br label %merge
+// merge:
+// %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// The sync dependence detection (which branch induces divergence in which join
+// points) is implemented in the SyncDependenceAnalysis.
+//
+// The current DivergenceAnalysis implementation has the following limitations:
+// 1. intra-procedural. It conservatively considers the arguments of a
+// non-kernel-entry function and the return value of a function call as
+// divergent.
+// 2. memory as black box. It conservatively considers values loaded from
+// generic or local address as divergent. This can be improved by leveraging
+// pointer analysis and/or by modelling non-escaping memory objects in SSA
+// as done in RV.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMirDivergenceAnalysis.h"
+#include "GCNSubtarget.h"
+#include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPUAsmUtils.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "SIInstrInfo.h"
+//#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/Support/Debug.h"
+//#include "newbe/cli/newbe_opts.h" // AMDGPU change.
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mir-divergence-analysis"
+
+namespace llvm {
+bool isAMDGPUOpcodeDivergent(class MachineInstr *MI);
+}
+
+//
+// TODO: TableGen these
+//
+bool llvm::isAMDGPUOpcodeDivergent(class MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ // case R600::INTERP_LOAD_P0:
+ // case R600::INTERP_PAIR_XY:
+ // case R600::INTERP_PAIR_ZW:
+ // case R600::INTERP_VEC_LOAD:
+ // case R600::INTERP_XY:
+ // case R600::INTERP_ZW:
+ case AMDGPU::V_WRITELANE_B32:
+
+ case AMDGPU::V_INTERP_MOV_F32:
+ case AMDGPU::V_INTERP_MOV_F32_e64:
+ case AMDGPU::V_INTERP_MOV_F32_e64_vi:
+ case AMDGPU::V_INTERP_MOV_F32_si:
+ case AMDGPU::V_INTERP_MOV_F32_vi:
+ case AMDGPU::V_INTERP_P1LL_F16:
+ case AMDGPU::V_INTERP_P1LL_F16_vi:
+ case AMDGPU::V_INTERP_P1LV_F16:
+ case AMDGPU::V_INTERP_P1LV_F16_vi:
+ case AMDGPU::V_INTERP_P1_F32:
+ case AMDGPU::V_INTERP_P1_F32_16bank:
+ case AMDGPU::V_INTERP_P1_F32_16bank_si:
+ case AMDGPU::V_INTERP_P1_F32_16bank_vi:
+ case AMDGPU::V_INTERP_P1_F32_e64:
+ case AMDGPU::V_INTERP_P1_F32_e64_vi:
+ case AMDGPU::V_INTERP_P1_F32_si:
+ case AMDGPU::V_INTERP_P1_F32_vi:
+ case AMDGPU::V_INTERP_P2_F16:
+ case AMDGPU::V_INTERP_P2_F16_vi:
+ case AMDGPU::V_INTERP_P2_F32:
+ case AMDGPU::V_INTERP_P2_F32_e64:
+ case AMDGPU::V_INTERP_P2_F32_e64_vi:
+ case AMDGPU::V_INTERP_P2_F32_si:
+ case AMDGPU::V_INTERP_P2_F32_vi:
+
+ case AMDGPU::V_MBCNT_HI_U32_B32_e32:
+ case AMDGPU::V_MBCNT_HI_U32_B32_e32_gfx6_gfx7:
+ case AMDGPU::V_MBCNT_HI_U32_B32_e64:
+ case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx10:
+ case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx6_gfx7:
+ case AMDGPU::V_MBCNT_HI_U32_B32_e64_vi:
+ case AMDGPU::V_MBCNT_HI_U32_B32_sdwa:
+ case AMDGPU::V_MBCNT_LO_U32_B32_e32:
+ case AMDGPU::V_MBCNT_LO_U32_B32_e32_gfx6_gfx7:
+ case AMDGPU::V_MBCNT_LO_U32_B32_e64:
+ case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx10:
+ case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx6_gfx7:
+ case AMDGPU::V_MBCNT_LO_U32_B32_e64_vi:
+ case AMDGPU::V_MBCNT_LO_U32_B32_sdwa:
+
+ case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_vi:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx10:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx6_gfx7:
+ case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_vi:
+
+ case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_vi:
+ //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10:
+ //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si:
+ //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_vi:
+ //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10:
+ //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10:
+ //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si:
+ //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_vi:
+ //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10:
+ //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10:
+ //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si:
+ //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_si:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_vi:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_si:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_vi:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_vi:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_nsa_gfx10:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_si:
+ case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_vi:
+
+ case AMDGPU::SI_PS_LIVE:
+
+ case AMDGPU::DS_SWIZZLE_B32:
+ case AMDGPU::DS_SWIZZLE_B32_gfx10:
+ case AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7:
+ case AMDGPU::DS_SWIZZLE_B32_vi:
+
+ return true;
+
+ default:
+ break;
+ }
+ return false;
+}
+
+namespace {
+bool hasImmOperandWithVal(const MachineInstr *MI, uint16_t srcNameIdx,
+ uint16_t srcModNameIdx, uint64_t Val) {
+ unsigned Op = MI->getOpcode();
+  int srcIdx = AMDGPU::getNamedOperandIdx(Op, srcNameIdx);
+ if (srcIdx == -1)
+ return false;
+ const MachineOperand &srcMO = MI->getOperand(srcIdx);
+ if (srcMO.isImm() && srcMO.getImm() == Val) {
+
+    int modIdx = AMDGPU::getNamedOperandIdx(Op, srcModNameIdx);
+ if (modIdx == -1)
+ return true;
+
+ const MachineOperand &modMO = MI->getOperand(modIdx);
+ if (modMO.getImm() == 0)
+ return true;
+ }
+ return false;
+}
+
+bool isConstant(const MachineInstr *MI) {
+ unsigned Op = MI->getOpcode();
+ switch (Op) {
+ default:
+ break;
+ case AMDGPU::V_OR_B32_e32:
+ case AMDGPU::V_OR_B32_e64: {
+    // Check the special case of OR with -1, which always yields -1.
+ const uint64_t kImm = -1;
+ if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0,
+ AMDGPU::OpName::src0_modifiers, kImm))
+ return true;
+ if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1,
+ AMDGPU::OpName::src1_modifiers, kImm))
+ return true;
+ } break;
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_OR_B64: {
+    // Check the special case of OR with -1, which always yields -1.
+ const uint64_t kImm = -1;
+ if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0,
+ AMDGPU::OpName::src0_modifiers, kImm))
+ return true;
+ if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1,
+ AMDGPU::OpName::src1_modifiers, kImm))
+ return true;
+ } break;
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::V_AND_B32_e64: {
+    // Check the special case of AND with 0, which always yields 0.
+ const uint64_t kImm = 0;
+ if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0,
+ AMDGPU::OpName::src0_modifiers, kImm))
+ return true;
+ if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1,
+ AMDGPU::OpName::src1_modifiers, kImm))
+ return true;
+ } break;
+ }
+ return false;
+}
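+
+// Illustrative examples (a sketch, not taken from this patch's tests) of MIR
+// patterns that isConstant() recognizes: an OR with the inline constant -1
+// always produces -1, and an AND with 0 always produces 0, e.g.
+//   %1:vgpr_32 = V_OR_B32_e32 -1, %0:vgpr_32, implicit $exec
+//   %2:sgpr_32 = S_AND_B32 0, %0:sgpr_32, implicit-def $scc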
+
+bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI,
+ const MachineRegisterInfo &MRI) {
+ const auto *BoolRC = SIRI->getBoolRC();
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
+ Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO)
+ return true;
+
+    // Check if the written register class overlaps the bool register class.
+    //
+    // Note that this check is insufficient to catch all of the cases where
+    // a "bool" value could be created (for example writing to a register
+    // pair s[0:1], then using s0 as a bool value in wave32).
+    //
+    // The underlying problem is that we have two notions of divergence
+    // (bit divergence and wave divergence) but the algorithm only propagates
+    // wave divergence. Bit divergence matters for bools because it determines
+    // whether a branch is uniform or not (and thus catches cases where a
+    // uniform value is used outside of a divergent control flow region). For
+    // bool values the algorithm treats normally uniform values (i.e. scalar
+    // registers) as divergent in order to propagate bit divergence.
+    //
+    // Fixing all the possible bugs here would require propagating bit
+    // divergence as well as wave divergence. That is a bigger fix; this check
+    // should cover most cases of treating a bool value as divergent.
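+    // A hypothetical wave32 sketch of such a pattern, which this check would
+    // miss: %0 writes the pair s[0:1], and s0 is later consumed as a lane
+    // mask:
+    //   %0:sreg_64 = S_MOV_B64 0
+    //   $vcc_lo = COPY %0.sub0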
+ const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+ if (SIRI->getCommonSubClass(BoolRC, RC))
+ return true;
+ }
+ return false;
+}
+
+bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI,
+ const MachineRegisterInfo &MRI) {
+ unsigned Op = MI->getOpcode();
+ switch (Op) {
+ default:
+    // Mark all scalar instructions as always uniform unless they write a bool
+    // dst or are terminators. This does not mean the result is bit-uniform;
+    // branch/exec-region checks use isBitUniform for that. A bool might be an
+    // sreg and still be divergent, since it just packs one bit per lane into
+    // a 64/32-bit sreg.
+ if (SIII->isScalarUnit(*MI) && !writeBoolDst(MI, SIRI, MRI) &&
+ !MI->isTerminator())
+ return true;
+ break;
+ //case AMDGPU::AMDGPU_MAKE_UNIFORM:
+ //case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST:
+ case AMDGPU::V_READFIRSTLANE_B32:
+ case AMDGPU::V_READLANE_B32:
+ //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32:
+ //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64:
+    // A readfirstlane/readlane result is a single lane's value broadcast to
+    // all lanes, so it is bit-uniform.
+ return true;
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_OR_B64: {
+    // Check the special case of OR with -1, which always yields -1.
+ if (isConstant(MI))
+ return true;
+
+ return !writeBoolDst(MI, SIRI, MRI);
+ } break;
+ case AMDGPU::V_OR_B32_e32:
+ case AMDGPU::V_OR_B32_e64: {
+    // Check the special case of OR with -1, which always yields -1.
+ if (isConstant(MI))
+ return true;
+ } break;
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_AND_B64: {
+    // Check the special case of AND with 0, which always yields 0.
+ if (isConstant(MI))
+ return true;
+
+ return !writeBoolDst(MI, SIRI, MRI);
+ } break;
+ case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::V_AND_B32_e64: {
+    // Check the special case of AND with 0, which always yields 0.
+ if (isConstant(MI))
+ return true;
+ } break;
+ }
+ return false;
+}
+
+bool isPhysicalReg(MachineRegisterInfo &MRI, Register Reg) {
+  return Reg.isPhysical();
+}
+
+bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) {
+ return MRI.getRegClass(reg)->getID() == regClassID;
+}
+
+// A live-in (input) register of the function is divergent if it is a VGPR.
+bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI,
+                         const SIRegisterInfo *SIRI) {
+  if (isPhysicalReg(MRI, Reg)) {
+    Register VirReg = MRI.getLiveInVirtReg(Reg);
+    if (SIRI->isVGPR(MRI, VirReg))
+      return true;
+ } else {
+ if (SIRI->isVGPR(MRI, Reg))
+ return true;
+ }
+ return false;
+}
+
+bool isSourceOfDivergence(MachineInstr *MI, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ //if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent))
+ // return true;
+ if (isAMDGPUOpcodeDivergent(MI))
+ return true;
+
+ if (isAlwaysUniformMI(MI, SIII, SIRI, MRI))
+ return false;
+
+  // If the instruction is neither guaranteed to be uniform nor divergent,
+  // check whether any of its operands are passed in to the shader as
+  // arguments through vector registers; such operands make it divergent.
+ for (MachineOperand &op : MI->operands()) {
+ if (!op.isReg())
+ continue;
+ if (op.isDef())
+ continue;
+ unsigned reg = op.getReg();
+ if (MRI.isLiveIn(reg)) {
+ if (isDivergentInputReg(reg, MRI, SIRI))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// For VCC, try to find the nearest def inside the same MBB.
+const MachineInstr *findPhysicalDefineInSameMBB(const MachineInstr *MI,
+ unsigned PhyReg) {
+ const MachineBasicBlock *MBB = MI->getParent();
+ auto it = MI->getReverseIterator();
+ for (it++; it != MBB->rend(); it++) {
+ const MachineInstr &TmpMI = *it;
+ for (const MachineOperand &DefMO : TmpMI.operands()) {
+ if (!DefMO.isReg())
+ continue;
+ if (DefMO.isUse())
+ continue;
+ if (DefMO.getReg() == PhyReg)
+ return &TmpMI;
+ }
+ }
+ return nullptr;
+}
+
+bool isWriteExec(const MachineInstr *MI) {
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == AMDGPU::EXEC ||
+ Reg == AMDGPU::EXEC_LO)
+ return true;
+ }
+ return false;
+}
+
+bool isVCndMask(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+ case AMDGPU::V_CNDMASK_B32_e32:
+ case AMDGPU::V_CNDMASK_B32_e64:
+ case AMDGPU::V_CNDMASK_B32_dpp:
+ case AMDGPU::V_CNDMASK_B32_sdwa:
+ case AMDGPU::V_CNDMASK_B64_PSEUDO:
+ return true;
+ }
+}
+
+bool isExecRegionOp(unsigned Op) {
+ switch (Op) {
+ default:
+ return false;
+ case AMDGPU::COPY:
+ case AMDGPU::S_MOV_B32:
+ case AMDGPU::S_MOV_B64:
+ return true;
+ }
+}
+
+bool isRestoreExec(const MachineInstr *MI) {
+ unsigned Op = MI->getOpcode();
+ if (!isExecRegionOp(Op))
+ return false;
+
+ return isWriteExec(MI);
+}
+
+const MachineInstr *
+findExecRegionBeginFromRegionEnd(const MachineInstr *MI,
+ const MachineRegisterInfo &MRI) {
+ const MachineOperand &MO = MI->getOperand(1);
+ if (!MO.isReg())
+ return nullptr;
+ unsigned Reg = MO.getReg();
+ const MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+ if (!Def)
+ return nullptr;
+ // Make sure the def is S_MOV Reg, Exec.
+ if (!isExecRegionOp(Def->getOpcode()))
+ return nullptr;
+ const MachineOperand &ExecMO = Def->getOperand(1);
+ if (!ExecMO.isReg())
+ return nullptr;
+ unsigned ExecReg = ExecMO.getReg();
+ if (ExecReg == AMDGPU::EXEC || ExecReg == AMDGPU::EXEC_LO)
+ return Def;
+ else
+ return nullptr;
+}
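+
+// For reference, a pseudo-MIR sketch (not verbatim) of the pattern matched by
+// isRestoreExec/findExecRegionBeginFromRegionEnd above:
+//   %saved:sreg_64 = S_MOV_B64 $exec    (region begin, the returned Def)
+//   ...
+//   $exec = S_MOV_B64 %saved            (region end, the MI passed in)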
+
+bool isInsideExecRegion(const MachineInstr &MI, const MachineInstr &RegionBegin,
+ const MachineInstr &RegionEnd,
+ const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT) {
+ if (!DT.dominates(&RegionBegin, &MI))
+ return false;
+
+ const MachineBasicBlock *MBB = MI.getParent();
+ const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent();
+ if (MBB != RegionEndMBB) {
+ return PDT.dominates(RegionEndMBB, MBB);
+ } else {
+    // Walk the basic block until we find MI or RegionEnd.
+ MachineBasicBlock::const_iterator I = MBB->begin();
+ for (; I != MI.getIterator() && I != RegionEnd.getIterator(); ++I)
+ /*empty*/;
+
+ // RegionEnd post-dominates MI if MI is found first in the basic block.
+ return I == MI.getIterator();
+ }
+}
+
+bool isInsideExecRegion(const MachineBasicBlock &MBB,
+ const MachineInstr &RegionBegin,
+ const MachineInstr &RegionEnd,
+ const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT) {
+ const MachineBasicBlock *RegionBeginMBB = RegionBegin.getParent();
+ const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent();
+ if (!DT.dominates(RegionBeginMBB, &MBB))
+ return false;
+ return PDT.dominates(RegionEndMBB, &MBB);
+}
+
+// ExecRegionMap maps each MBB to its nearest enclosing exec region (every MBB
+// is added unless it already belongs to a smaller region). When a saveexec is
+// hit, users that leak out of the region from defs inside it are propagated.
+
+} // namespace
+
+namespace llvm {
+// class DivergenceAnalysis
+DivergenceAnalysis::DivergenceAnalysis(
+ const MachineFunction &F, const MachineLoop *RegionLoop, const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI,
+ SyncDependenceAnalysis &SDA, bool IsLCSSAForm,
+ // AMDGPU change begin.
+ DivergentJoinMapTy &JoinMap
+ // AMDGPU change end.
+ )
+ : F(F), MRI(F.getRegInfo()), RegionLoop(RegionLoop), DT(DT), PDT(PDT),
+ LI(LI), SDA(SDA), DivergentJoinMap(JoinMap), // AMDGPU change
+ IsLCSSAForm(IsLCSSAForm) {
+ const GCNSubtarget *ST = &F.getSubtarget<GCNSubtarget>();
+ SIRI = ST->getRegisterInfo();
+ SIII = ST->getInstrInfo();
+}
+
+void DivergenceAnalysis::markDivergent(const ValueTy DivVal) {
+  assert(!isAlwaysUniform(DivVal) && "cannot be divergent");
+ // AMDGPU change begin.
+ LLVM_DEBUG(const GCNSubtarget *ST = &F.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+ dbgs() << "\t MarkDivergent :"; printReg(DivVal, SIRI););
+ //AMDGPU change end.
+ DivergentValues.insert(DivVal);
+}
+
+// Mir change.
+void DivergenceAnalysis::markDivergent(const MachineInstr &I) {
+ for (const MachineOperand &DstMO : I.defs()) {
+ unsigned Reg = DstMO.getReg();
+ markDivergent(Reg);
+ }
+ DivergentInsts.insert(&I);
+}
+
+void DivergenceAnalysis::addUniformOverride(const ValueTy UniVal) {
+ // TODO: support uniform multi-def.
+ if (MRI.getUniqueVRegDef(UniVal) == nullptr)
+ return;
+
+ UniformOverrides.insert(UniVal);
+}
+
+void DivergenceAnalysis::addUniformOverride(const MachineInstr &I) {
+ for (const MachineOperand &DstMO : I.defs()) {
+ unsigned Reg = DstMO.getReg();
+ addUniformOverride(Reg);
+ }
+ UniformOverridesInsts.insert(&I);
+}
+
+bool DivergenceAnalysis::isBitUniform(
+ const MachineInstr &I, const llvm::MachineOperand &UseMO,
+ llvm::DenseMap<const MachineInstr *, bool> &Processed) const {
+  if (UseMO.isImm()) {
+    int64_t Val = UseMO.getImm();
+    // 0 and -1 are OK since every lane still sees the same bit value.
+    return Val == 0 || Val == -1;
+  }
+ if (!UseMO.isReg())
+ return true;
+ unsigned Reg = UseMO.getReg();
+ // Exec is always bituniform, because all active lanes are 1.
+ if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
+ // SCC only has 1 bit. Always bituniform.
+ Reg == AMDGPU::SCC)
+ return true;
+
+ const MachineInstr *UseMI = nullptr;
+ if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO) {
+ // Try to find define of this VCC.
+ UseMI = findPhysicalDefineInSameMBB(&I, Reg);
+ } else {
+ UseMI = MRI.getUniqueVRegDef(Reg);
+ }
+ if (!UseMI) {
+ return false;
+ }
+
+ bool bResult = isBitUniform(*UseMI, Processed);
+ Processed[UseMI] = bResult;
+ return bResult;
+}
+
+bool DivergenceAnalysis::isBitUniform(
+ const MachineInstr &I,
+ llvm::DenseMap<const MachineInstr *, bool> &Processed) const {
+ auto it = Processed.find(&I);
+ if (it != Processed.end())
+ return it->second;
+  // For a branch on MIR, all active lanes must hold the same bit value.
+  // A compare of uniform values guarantees this, and an immediate is
+  // trivially the same for all active lanes.
+ if (isDivergent(I))
+ return false;
+ // Uniform cmp is bit uniform.
+ if (I.isCompare())
+ return true;
+ if (isConstant(&I))
+ return true;
+
+  // Conservatively assume not bit-uniform; this also breaks cycles while
+  // recursing through operands.
+ Processed[&I] = false;
+
+  // If all operands are bit-uniform, the result is bit-uniform.
+ bool bAllOperandBitUniform = true;
+ for (const MachineOperand &UseMO : I.uses()) {
+ if (isBitUniform(I, UseMO, Processed))
+ continue;
+ bAllOperandBitUniform = false;
+ break;
+ }
+ return bAllOperandBitUniform;
+}
+
+bool DivergenceAnalysis::updateTerminator(const MachineInstr &Term) const {
+ if (Term.getParent()->succ_size() <= 1)
+ return false;
+ switch (Term.getOpcode()) {
+ default: {
+ if (updateNormalInstruction(Term))
+ return true;
+ llvm::DenseMap<const MachineInstr *, bool> Processed;
+ // Check bit uniform here if not divergent.
+ return !isBitUniform(Term, Processed);
+ }
+ //case AMDGPU::AMDGPU_CALL_INDIRECT:
+ case AMDGPU::SI_CALL:
+ return true;
+ }
+}
+
+bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const {
+ // TODO function calls with side effects, etc
+ if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end())
+ return false;
+ if (DivergentInsts.find(&I) != DivergentInsts.end())
+ return true;
+ for (const auto &Op : I.uses()) {
+ if (!Op.isReg())
+ continue;
+ Register Reg = Op.getReg();
+ if (Reg.isPhysical()) {
+ if (Reg == AMDGPU::EXEC ||
+ Reg == AMDGPU::EXEC_LO ||
+ Reg == AMDGPU::SCC)
+ continue;
+ else
+ if (const MachineInstr *DefMI =
+ findPhysicalDefineInSameMBB(Op.getParent(), Reg)) {
+ if (isDivergent(*DefMI))
+ return true;
+ } else {
+ // If cannot find def in same MBB, just treat it as divergent.
+ return true;
+ }
+ } else {
+ if (isDivergent(Op.getReg()))
+ return true;
+ }
+ }
+ return false;
+}
+
+bool DivergenceAnalysis::isTemporalDivergent(const MachineBasicBlock &ObservingBlock,
+ const ValueTy Val,
+ const MachineBasicBlock &IncomingBlock) const { // AMDGPU change
+  // AMDGPU change: take the def point to be the incoming block for constants.
+  const MachineBasicBlock *DefBlock = &IncomingBlock;
+  const auto *Inst = MRI.getUniqueVRegDef(Val);
+  if (!Inst)
+    return true;
+  DefBlock = Inst->getParent();
+
+ // check whether any divergent loop carrying Val terminates before control
+ // proceeds to ObservingBlock
+  for (const auto *Loop = LI.getLoopFor(DefBlock); // AMDGPU change
+       Loop != RegionLoop && !Loop->contains(&ObservingBlock);
+       Loop = Loop->getParentLoop()) {
+    if (DivergentLoops.find(Loop) != DivergentLoops.end())
+ return true;
+ }
+
+ return false;
+}
+
+// AMDGPU CHANGE BEGIN
+static bool HasIncomingUndefValue(const PHINode_ *Phi) {
+ for (unsigned I = 1, E = Phi->getNumOperands(); I != E; I += 2) {
+ const MachineOperand &Op = Phi->getOperand(I);
+ if (Op.isUndef())
+ return true;
+ }
+ return false;
+}
+
+// For case like
+// %163:sreg_64_xexec = S_MOV_B64 $exec
+//bb.1:
+//; predecessors: %bb.1, %bb.0
+// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
+// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
+// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec
+// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec
+// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+//...
+// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc
+// S_CBRANCH_EXECNZ %bb.1, implicit $exec
+// The ... code after SAVEEXEC will be divergent if %168 is divergent.
+// Return the SaveExec which affects MI, or nullptr if there is none.
+static const MachineInstr *
+findSaveExec(const MachineInstr *MI,
+ const SmallVector<const MachineInstr *, 2> &SaveExecs) {
+ // No save exec.
+ if (SaveExecs.empty())
+ return nullptr;
+ if (SaveExecs.size() > 1)
+ llvm::report_fatal_error(
+ "Not support case where, MBB has more than one SaveExec");
+ const MachineInstr *SaveExec = SaveExecs.front();
+ const MachineBasicBlock *MBB = SaveExec->getParent();
+  // Make sure MI is after SaveExec by checking that it is not before it.
+  // Assume the range from MBB.begin() to SaveExec is short here.
+ bool bIsAfterSaveExec = true;
+ for (auto it = MBB->begin(); it != SaveExec->getIterator(); it++) {
+ if (MI == it) {
+ bIsAfterSaveExec = false;
+ break;
+ }
+ }
+  // Not affected by the save exec.
+ if (!bIsAfterSaveExec)
+ return nullptr;
+
+ return SaveExec;
+}
+
+// When a Phi's parent is join-divergent, what makes the phi divergent is two
+// incoming values that merge from different paths of a divergent branch.
+// isJoinDivergentOnlyOnSameIncomingValue checks all combinations of incoming
+// blocks except those that carry the same incoming value, because if the
+// values are equal even a divergent branch cannot make the phi divergent.
+// For example, for phi a:A, b:B, a:C it checks (A,B) and (B,C) but not (A,C),
+// because A and C carry the same value a. If only (A,C) share a divergent
+// branch, then phi a:A, b:B, a:C is still uniform.
+// DivergentJoinMap stores pairs of MachineBasicBlocks that lie on different
+// paths of a divergent branch and join at one block.
+// For example,
+// A
+// / \
+// | \
+// | \
+// B /
+// | \ /
+// | \ /
+// C D
+// | /
+// \ /
+// E
+// If A is a uniform branch and B is a divergent branch, then only (C, D) is
+// saved in DivergentJoinMap.
+// DivergentJoinMap is built with updateDisjointMap in
+// SyncDependenceAnalysis.cpp when SyncDependenceAnalysis::join_blocks is
+// called. That only runs on divergent branches, so (A, B) is not in
+// DivergentJoinMap when A is uniform.
+static bool isJoinDivergentOnlyOnSameIncomingValue(
+ const PHINode_ &Phi, const DivergenceAnalysis *pDA, const MachineDominatorTree &DT,
+ DivergentJoinMapTy &DivergentJoinMap) {
+  // For a phi that joins divergent paths, if the incoming values from the
+  // divergent branch are the same, the phi is still uniform.
+ // A
+ // | \
+ // | \
+ // B \
+ // |\ \
+ // | \ |
+ // C D E
+ // | / /
+ // \/ /
+ // \ /
+ // F
+  // For a phi in F like
+  //   phi (a:C, a:D, b:E)
+  // if A is a uniform branch and B is a non-uniform branch, the phi is
+  // uniform.
+ SmallDenseSet<unsigned, 8> ValueToBlockMap;
+ for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
+ const MachineOperand &Op = Phi.getOperand(I);
+ if (!Op.isReg())
+ continue;
+ unsigned Reg = Op.getReg();
+ if (pDA->isDivergent(Reg))
+ return false;
+
+ ValueToBlockMap.insert(Reg);
+ }
+ unsigned NumIncoming = (Phi.getNumOperands() - 1) / 2;
+  // There is a repeated incoming value from different incoming blocks.
+  // If the divergent join only selects between equal values, the phi is still
+  // uniform.
+  if (ValueToBlockMap.size() != NumIncoming) {
+    // When a phi is in a divergent join block, some incoming blocks come from
+    // different paths of a divergent branch. Check all combinations here.
+ for (unsigned i = 0; i < NumIncoming; i++) {
+ MachineBasicBlock *BB0 = Phi.getOperand(2 + 2 * i).getMBB();
+ const MachineOperand &MO0 = Phi.getOperand(1 + 2 * i);
+ for (unsigned j = i + 1; j < NumIncoming; j++) {
+ MachineBasicBlock *BB1 = Phi.getOperand(2 + 2 * j).getMBB();
+ const MachineOperand &MO1 = Phi.getOperand(1 + 2 * j);
+        // If the values match, this pair cannot cause divergence.
+ if (MO0.isImm() && MO1.isImm() && MO0.getImm() == MO1.getImm())
+ continue;
+ if (MO0.isReg() && MO1.isReg() && MO0.getReg() == MO1.getReg() &&
+ MO0.getSubReg() == MO1.getSubReg())
+ continue;
+
+        // If BB0 and BB1 are divergent-disjoint, then they divergent-join at
+        // the phi.
+        // This is for a case like
+ // A
+ // / \
+ // | \
+ // | \
+ // B /
+ // | \ /
+ // | \ /
+ // C D
+ // | /
+ // \ /
+ // E
+ //
+ // phi(a:C, b:D)
+        // The nearest common dominator is A, but B can also be the divergent
+        // disjoint point for C and D.
+ if (DivergentJoinMap[BB0].count(BB1))
+ return false;
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+// AMDGPU CHANGE END
+
+bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const {
+ // AMDGPU CHANGE BEGIN
+ // Do not mark phis with undef as incoming values as uniform.
+ // When promoting to scalar we will readfirstlane on
+ // the phi output. If some of the inputs are undef then
+ // this could replace a well defined vector value with an
+ // undefined scalar value.
+ if (HasIncomingUndefValue(&Phi))
+ return true;
+ // AMDGPU CHANGE END
+
+ // joining divergent disjoint path in Phi parent block
+ if (isJoinDivergent(*Phi.getParent())) {
+ // AMDGPU CHANGE BEGIN
+ if (true/*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) {
+ // Continue if the divergent join only on same incoming value.
+ if (!isJoinDivergentOnlyOnSameIncomingValue(Phi, this, DT,
+ DivergentJoinMap))
+ return true;
+ } else
+ // AMDGPU CHANGE END
+ return true;
+ }
+
+ // An incoming value could be divergent by itself.
+ // Otherwise, an incoming value could be uniform within the loop
+ // that carries its definition but it may appear divergent
+ // from outside the loop. This happens when divergent loop exits
+ // drop definitions of that uniform value in different iterations.
+ //
+ // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop
+ // if (i % thread_id == 0) break; // divergent loop exit
+ // }
+ // int divI = i; // divI is divergent
+ for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
+ const MachineOperand &Op = Phi.getOperand(I);
+ if (!Op.isReg())
+ continue;
+
+ unsigned Reg = Op.getReg();
+ const MachineOperand &BB = Phi.getOperand(I + 1);
+ if (isDivergent(Reg) ||
+ isTemporalDivergent(*Phi.getParent(), Reg, *BB.getMBB()))
+ return true;
+
+ }
+
+ return false;
+}
+
+bool DivergenceAnalysis::updateVCndMask(const MachineInstr &VCndMask) const {
+  // A VCndMask requires its condition to be bit-uniform for the result to be
+  // uniform.
+ unsigned Op = VCndMask.getOpcode();
+ unsigned src0Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src0);
+ unsigned src1Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src1);
+ unsigned src2Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src2);
+
+ const MachineOperand &src0 = VCndMask.getOperand(src0Idx);
+ const MachineOperand &src1 = VCndMask.getOperand(src1Idx);
+
+ const MachineOperand &cond = VCndMask.getOperand(src2Idx);
+
+ if (isDivergent(src0))
+ return true;
+
+  // If src0 == src1 (same subreg and modifiers), the result equals src0 for
+  // every lane, so the condition does not matter.
+ if (src0.isReg() && src1.isReg() && src0.getReg() == src1.getReg()) {
+ if (src0.getSubReg() == src1.getSubReg() &&
+ SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src0_modifiers) ==
+ SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src1_modifiers))
+ return false;
+ }
+
+ if (isDivergent(src1))
+ return true;
+
+ llvm::DenseMap<const MachineInstr *, bool> Processed;
+ return !isBitUniform(VCndMask, cond, Processed);
+}
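+
+// Pseudo-MIR sketch (illustrative only) for updateVCndMask: in
+//   %d:vgpr_32 = V_CNDMASK_B32_e64 0, %a, 0, %a, %cond, implicit $exec
+// every lane selects %a, so the result is exactly as divergent as %a and the
+// condition does not matter; otherwise a non-bit-uniform %cond makes the
+// select divergent.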
+
+bool DivergenceAnalysis::inRegion(const MachineInstr &I) const {
+ return I.getParent() && inRegion(*I.getParent());
+}
+
+bool DivergenceAnalysis::inRegion(const MachineBasicBlock &BB) const {
+ return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB);
+}
+
+// marks all users of loop-carried values of the loop headed by LoopHeader as
+// divergent
+void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader) {
+ auto *DivLoop = LI.getLoopFor(&LoopHeader);
+ assert(DivLoop && "loopHeader is not actually part of a loop");
+
+ SmallVector<MachineBasicBlock *, 8> TaintStack;
+ DivLoop->getExitBlocks(TaintStack);
+
+ // Otherwise potential users of loop-carried values could be anywhere in the
+ // dominance region of DivLoop (including its fringes for phi nodes)
+ DenseSet<const MachineBasicBlock *> Visited;
+ for (auto *Block : TaintStack) {
+ Visited.insert(Block);
+ }
+ Visited.insert(&LoopHeader);
+
+ while (!TaintStack.empty()) {
+ auto *UserBlock = TaintStack.back();
+ TaintStack.pop_back();
+
+ // don't spread divergence beyond the region
+ if (!inRegion(*UserBlock))
+ continue;
+
+ assert(!DivLoop->contains(UserBlock) &&
+ "irreducible control flow detected");
+
+ // phi nodes at the fringes of the dominance region
+ if (!DT.dominates(&LoopHeader, UserBlock)) {
+ // all PHI nodes of UserBlock become divergent
+ pushPHINodes(*UserBlock);
+ continue;
+ }
+
+ // taint outside users of values carried by DivLoop
+ for (auto &I : *UserBlock) {
+ if (isAlwaysUniformMI(&I, SIII, SIRI, MRI))
+ continue;
+ if (isDivergent(I))
+ continue;
+
+ for (auto &Op : I.uses()) {
+ if (!Op.isReg())
+ continue;
+ unsigned OpReg = Op.getReg();
+ MachineInstr *OpInst = MRI.getUniqueVRegDef(OpReg);
+ if (!OpInst)
+ continue;
+ if (DivLoop->contains(OpInst->getParent())) {
+ markDivergent(I);
+ pushUsers(I);
+ break;
+ }
+ }
+ }
+
+ // visit all blocks in the dominance region
+ for (auto *SuccBlock : UserBlock->successors()) {
+ if (!Visited.insert(SuccBlock).second) {
+ continue;
+ }
+ TaintStack.push_back(SuccBlock);
+ }
+ }
+}
+
+void DivergenceAnalysis::pushInstruction(const MachineInstr &I) {
+ Worklist.push_back(&I);
+}
+void DivergenceAnalysis::pushPHINodes(const MachineBasicBlock &Block) {
+ for (const auto &Phi : Block.phis()) {
+ if (isDivergent(Phi))
+ continue;
+ pushInstruction(Phi);
+ }
+}
+
+void DivergenceAnalysis::pushUsers(const ValueTy V) {
+ for (const auto &UserInst : MRI.use_nodbg_instructions(V)) {
+
+ if (isDivergent(UserInst))
+ continue;
+
+ // only compute divergent inside loop
+ if (!inRegion(UserInst))
+ continue;
+
+ Worklist.push_back(&UserInst);
+ }
+}
+void DivergenceAnalysis::pushUsers(const MachineInstr &I) {
+ for (const auto &DstMO : I.defs()) {
+ unsigned Reg = DstMO.getReg();
+ pushUsers(Reg);
+ }
+}
+
+bool DivergenceAnalysis::propagateJoinDivergence(const MachineBasicBlock &JoinBlock,
+ const MachineLoop *BranchLoop) {
+ LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");
+
+ // ignore divergence outside the region
+ if (!inRegion(JoinBlock)) {
+ return false;
+ }
+
+ // push non-divergent phi nodes in JoinBlock to the worklist
+ pushPHINodes(JoinBlock);
+
+ // JoinBlock is a divergent loop exit
+ if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
+ return true;
+ }
+
+ // disjoint-paths divergent at JoinBlock
+ markBlockJoinDivergent(JoinBlock);
+ return false;
+}
+
+void DivergenceAnalysis::propagateBranchDivergence(const MachineInstr &Term) {
+ LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");
+
+ markDivergent(Term);
+
+ const auto *BranchLoop = LI.getLoopFor(Term.getParent());
+
+ // whether there is a divergent loop exit from BranchLoop (if any)
+ bool IsBranchLoopDivergent = false;
+
+  // Iterate over all blocks reachable by disjoint paths from Term within the
+  // loop; this also covers loop exits that become divergent due to Term.
+ for (const auto *JoinBlock : SDA.join_blocks(Term)) {
+ IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+ }
+
+ // Branch loop is a divergent loop due to the divergent branch in Term
+ if (IsBranchLoopDivergent) {
+ assert(BranchLoop);
+ if (!DivergentLoops.insert(BranchLoop).second) {
+ return;
+ }
+ propagateLoopDivergence(*BranchLoop);
+ }
+}
+
+void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop) {
+ LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() << "\n");
+
+ // don't propagate beyond region
+ if (!inRegion(*ExitingLoop.getHeader()))
+ return;
+
+ const auto *BranchLoop = ExitingLoop.getParentLoop();
+
+ // Uses of loop-carried values could occur anywhere
+ // within the dominance region of the definition. All loop-carried
+ // definitions are dominated by the loop header (reducible control).
+ // Thus all users have to be in the dominance region of the loop header,
+ // except PHI nodes that can also live at the fringes of the dom region
+ // (incoming defining value).
+ if (!IsLCSSAForm)
+ taintLoopLiveOuts(*ExitingLoop.getHeader());
+
+ // whether there is a divergent loop exit from BranchLoop (if any)
+ bool IsBranchLoopDivergent = false;
+
+  // Iterate over all blocks reachable by disjoint paths from the exits of
+  // ExitingLoop; this also covers loop exits (of BranchLoop) that in turn
+  // become divergent.
+ for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
+ IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+ }
+
+  // BranchLoop is divergent due to a divergent loop exit in ExitingLoop.
+ if (IsBranchLoopDivergent) {
+ assert(BranchLoop);
+ if (!DivergentLoops.insert(BranchLoop).second) {
+ return;
+ }
+ propagateLoopDivergence(*BranchLoop);
+ }
+}
+
+// For case like
+// %149:sreg_64_xexec = S_MOV_B64 $exec
+//
+//bb.3:
+//; predecessors: %bb.3, %bb.2
+// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), %bb.4(50.00%)
+//
+// %148:vreg_512 = PHI %56:vreg_512, %bb.2, %55:vreg_512, %bb.3
+// %153:sgpr_32 = V_READFIRSTLANE_B32 %36:vgpr_32, implicit $exec
+// %154:sreg_64 = V_CMP_EQ_U32_e64 %153:sgpr_32, %36:vgpr_32, implicit $exec
+// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+// $m0 = S_MOV_B32 %153:sgpr_32
+// %55:vreg_512 = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit $exec
+// $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc
+// S_CBRANCH_EXECNZ %bb.3, implicit $exec
+//
+//bb.4:
+//; predecessors: %bb.3
+// successors: %bb.5(0x80000000); %bb.5(100.00%)
+//
+// $exec = S_MOV_B64 %149:sreg_64_xexec
+
+// bb.3 is inside the exec region whose exec is saved in %149.
+// %152:sreg_64 = S_AND_SAVEEXEC_B64 updates exec, which causes divergence when
+// its source is not bit-uniform. Everything inside the exec region needs to be
+// scanned; out-of-region and phi uses are marked divergent and their users are
+// added to the worklist.
+void DivergenceAnalysis::propagateExecControlFlowDivergence(
+ const MachineInstr &SaveExec) {
+ const MachineBasicBlock *MBB = SaveExec.getParent();
+ auto it = ExecRegionMap.find(MBB);
+ if (it == ExecRegionMap.end())
+ return;
+ ExecRegion &Region = *it->second;
+  // A region only needs to be propagated once.
+ if (Region.bPropagated)
+ return;
+ Region.bPropagated = true;
+  // Scan all MIs in the region. Mark out-of-region and phi uses as divergent
+  // and add their users to the worklist.
+  auto propagateExecDivergence = [this, &Region](const MachineInstr *MI) {
+ for (const auto &DstMO : MI->defs()) {
+ Register Reg = DstMO.getReg();
+      // Physical defs here are only VCC/EXEC/M0. EXEC is always uniform;
+      // assume VCC and M0 do not cross the region.
+ if (Reg.isPhysical())
+ continue;
+ for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) {
+
+ if (isDivergent(UserInst))
+ continue;
+
+        // Only propagate to users outside the region, or to phis, which are
+        // not guarded by the saveexec.
+ if (UserInst.getOpcode() != AMDGPU::PHI &&
+ isInsideExecRegion(UserInst, *Region.begin, *Region.end, DT, PDT)) {
+ continue;
+ }
+ // Write exec is not divergent.
+ if (isWriteExec(&UserInst))
+ continue;
+
+ markDivergent(UserInst);
+ pushUsers(UserInst);
+ }
+ }
+ };
+ const MachineBasicBlock *RegionBeginMBB = Region.begin->getParent();
+ const MachineBasicBlock *RegionEndMBB = Region.end->getParent();
+ if (RegionBeginMBB != RegionEndMBB) {
+ auto it = Region.begin->getIterator();
+ for (it++; it != RegionBeginMBB->end(); it++) {
+ const MachineInstr &MI = *it;
+ propagateExecDivergence(&MI);
+ }
+
+ // All blocks between RegionBeginMBB and RegionEndMBB.
+ for (const MachineBasicBlock *MBB : Region.blocks) {
+ for (const MachineInstr &MI : *MBB) {
+ propagateExecDivergence(&MI);
+ }
+ }
+
+ for (auto it = RegionEndMBB->begin(); it != Region.end->getIterator();
+ it++) {
+ const MachineInstr &MI = *it;
+ propagateExecDivergence(&MI);
+ }
+
+ } else {
+ auto it = Region.begin->getIterator();
+ for (it++; it != Region.end->getIterator(); it++) {
+ const MachineInstr &MI = *it;
+ propagateExecDivergence(&MI);
+ }
+ }
+}
+
+void DivergenceAnalysis::compute() {
+ SmallVector<ExecRegion, 4> ExecRegions;
+ // Build exec regions.
+  // Add VCndMasks whose condition may be non-bit-uniform due to input sregs.
+ for (const MachineBasicBlock &MBB : F) {
+ for (const MachineInstr &Term : MBB.terminators()) {
+ if (updateTerminator(Term))
+ pushInstruction(Term);
+ }
+
+ for (const MachineInstr &I : MBB) {
+ unsigned Opcode = I.getOpcode();
+ if (isVCndMask(Opcode)) {
+ // Cond for CndMask needs bit uniform check.
+ // Add it to worklist to check bit uniform from input.
+ pushInstruction(I);
+ } else if (isRestoreExec(&I)) {
+ const MachineInstr *RegionBegin =
+ findExecRegionBeginFromRegionEnd(&I, MRI);
+ if (RegionBegin) {
+ ExecRegions.emplace_back(ExecRegion(RegionBegin, &I));
+ }
+ }
+ }
+ }
+
+ // Build exec region map.
+ for (const MachineBasicBlock &MBB : F) {
+ for (ExecRegion &Region : ExecRegions) {
+ if (isInsideExecRegion(MBB, *Region.begin, *Region.end, DT, PDT)) {
+ // Add block to region.
+ if (&MBB != Region.begin->getParent() &&
+ &MBB != Region.end->getParent())
+ Region.blocks.emplace_back(&MBB);
+ // Update ExecRegionMap.
+ auto it = ExecRegionMap.find(&MBB);
+ if (it == ExecRegionMap.end()) {
+ ExecRegionMap[&MBB] = &Region;
+ } else {
+          // When the MBB is inside multiple regions, keep the smallest one.
+ if (isInsideExecRegion(*Region.begin, *it->second->begin,
+ *it->second->end, DT, PDT)) {
+ ExecRegionMap[&MBB] = &Region;
+ }
+ }
+ }
+ }
+ }
+
+ for (auto DivVal : DivergentValues) {
+ LLVM_DEBUG(dbgs() << "\t sourceOfDivergence :"; printReg(DivVal, SIRI);
+ dbgs() << "\n";);
+ pushUsers(DivVal);
+ }
+
+ // propagate divergence
+ while (!Worklist.empty()) {
+    const MachineInstr *I = Worklist.back();
+ Worklist.pop_back();
+
+ // maintain uniformity of overrides
+ if (isAlwaysUniformMI(I, SIII, SIRI, MRI)) {
+      // If the result is used by terminators and is not bit-uniform, add
+      // those terminators to the worklist.
+ SmallVector<const MachineInstr *, 2> TermUsers;
+ for (const auto &DstMO : I->defs()) {
+ unsigned Reg = DstMO.getReg();
+ for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) {
+
+ if (isDivergent(UserInst))
+ continue;
+ // Only check terminator here.
+ if (!UserInst.isTerminator())
+ continue;
+
+ // only compute divergent inside loop
+ if (!inRegion(UserInst))
+ continue;
+
+ TermUsers.emplace_back(&UserInst);
+ }
+ }
+
+ if (!TermUsers.empty()) {
+ llvm::DenseMap<const MachineInstr *, bool> Processed;
+ if (!isBitUniform(*I, Processed)) {
+ for (const MachineInstr *Term : TermUsers) {
+ Worklist.emplace_back(Term);
+ }
+ }
+ }
+
+ continue;
+ }
+
+ bool WasDivergent = isDivergent(*I);
+ if (WasDivergent)
+ continue;
+
+ // propagate divergence caused by terminator
+ if (I->isTerminator()) {
+ if (updateTerminator(*I)) {
+ // propagate control divergence to affected instructions
+ propagateBranchDivergence(*I);
+ continue;
+ }
+ }
+
+ // update divergence of I due to divergent operands
+ bool DivergentUpd = false;
+ unsigned Opcode = I->getOpcode();
+ switch (I->getOpcode()) {
+ default:
+ if (isVCndMask(Opcode)) {
+ DivergentUpd = updateVCndMask(*I);
+ } else {
+ DivergentUpd = updateNormalInstruction(*I);
+ llvm::DenseMap<const MachineInstr *, bool> Processed;
+ if ((DivergentUpd || !isBitUniform(*I, Processed)) && isWriteExec(I)) {
+ // propagate exec control divergence to affected instructions.
+ propagateExecControlFlowDivergence(*I);
+ }
+ }
+ break;
+ case AMDGPU::PHI:
+ DivergentUpd = updatePHINode(*I);
+ break;
+ }
+
+ // propagate value divergence to users
+ if (DivergentUpd) {
+ markDivergent(*I);
+ pushUsers(*I);
+ }
+ }
+}
+
+bool DivergenceAnalysis::isAlwaysUniform(const ValueTy V) const {
+ return UniformOverrides.find(V) != UniformOverrides.end();
+}
+
+bool DivergenceAnalysis::isDivergent(const ValueTy V) const {
+ return DivergentValues.find(V) != DivergentValues.end();
+}
+
+bool DivergenceAnalysis::isDivergent(const MachineOperand &MO) const {
+ if (!MO.isReg())
+ return false;
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical()) {
+ const MachineInstr *MI = MO.getParent();
+ if (MI)
+      return isDivergent(*MI);
+
+ } else {
+ return isDivergent(Reg);
+ }
+ return true;
+}
+
+bool DivergenceAnalysis::isDivergent(const MachineInstr &I) const {
+ if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end())
+ return false;
+ if (DivergentInsts.find(&I) != DivergentInsts.end())
+ return true;
+ for (const MachineOperand &DstMO : I.defs()) {
+ unsigned Reg = DstMO.getReg();
+ if (isDivergent(Reg))
+ return true;
+ }
+ return false;
+}
+
+void DivergenceAnalysis::print(raw_ostream &OS, const Module_ *) const {
+  // Iterate over instructions in block order to ensure a deterministic order.
+ for (auto &MBB : F)
+ for (auto &I : MBB) {
+ if (isDivergent(I))
+ OS << "DIVERGENT:" << I ;
+ // AMDGPU changes begin
+ else
+ OS << "UNIFORM:" << I ;
+ // AMDGPU changes end
+ }
+}
+
+// class GPUDivergenceAnalysis
+MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis(MachineFunction &F,
+ const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI)
+ : SDA(DT, PDT, LI, /*AMDGPU change*/DivergentJoinMap),
+ DA(F, nullptr, DT, PDT, LI, SDA, false, /*AMDGPU change*/DivergentJoinMap) {
+ MachineRegisterInfo &MRI = F.getRegInfo();
+ const GCNSubtarget *ST = &F.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+ const SIInstrInfo *SIII = ST->getInstrInfo();
+ for (auto &MBB : F)
+ for (auto &I : MBB) {
+ if (isSourceOfDivergence(&I, MRI, SIRI, SIII)) {
+ DA.markDivergent(I);
+ } else if (isAlwaysUniformMI(&I, SIII, SIRI, MRI)) {
+ DA.addUniformOverride(I);
+ }
+ }
+ for (auto &ArgIt : F.getRegInfo().liveins()) {
+ unsigned Reg = ArgIt.first;
+ if (isDivergentInputReg(Reg, MRI, SIRI)) {
+ DA.markDivergent(Reg);
+ }
+ }
+
+ DA.compute();
+}
+
+bool MirGPUDivergenceAnalysis::isDivergent(const MachineInstr *I) const {
+ return DA.isDivergent(*I);
+}
+
+void MirGPUDivergenceAnalysis::print(raw_ostream &OS, const Module_ *mod) const {
+ OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
+ DA.print(OS, mod);
+ OS << "}\n";
+}
+
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
new file mode 100644
index 000000000000000..edcf96ec44a4d59
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
@@ -0,0 +1,281 @@
+//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// The divergence analysis determines which instructions and branches are
+// divergent given a set of divergent source instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "AMDGPUMirSyncDependenceAnalysis.h"
+#include "llvm/Pass.h"
+#include <vector>
+
+namespace llvm {
+class raw_ostream;
+class TargetTransformInfo;
+class MachineRegisterInfo;
+class SIInstrInfo;
+class SIRegisterInfo;
+class MachineOperand;
+class MachineBasicBlock;
+
+using Module_ = void;
+using ValueTy = unsigned;
+using PHINode_ = MachineInstr;
+
+/// \brief Generic divergence analysis for reducible CFGs.
+///
+/// This analysis propagates divergence in a data-parallel context from sources
+/// of divergence to all users. It requires reducible CFGs. All assignments
+/// should be in SSA form.
+class DivergenceAnalysis {
+public:
+ /// \brief This instance will analyze the whole function \p F or the loop \p
+ /// RegionLoop.
+ ///
+ /// \param RegionLoop if non-null the analysis is restricted to \p RegionLoop.
+ /// Otherwise the whole function is analyzed.
+ /// \param IsLCSSAForm whether the analysis may assume that the IR in the
+  /// region is in LCSSA form.
+ DivergenceAnalysis(const llvm::MachineFunction &F, const MachineLoop *RegionLoop,
+ const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA,
+ bool IsLCSSAForm,
+ // AMDGPU change begin.
+ DivergentJoinMapTy &JoinMap
+ // AMDGPU change end.
+ );
+
+ /// \brief The loop that defines the analyzed region (if any).
+ const MachineLoop *getRegionLoop() const { return RegionLoop; }
+ const llvm::MachineFunction &getFunction() const { return F; }
+
+ /// \brief Whether \p BB is part of the region.
+ bool inRegion(const MachineBasicBlock &BB) const;
+ /// \brief Whether \p I is part of the region.
+ bool inRegion(const MachineInstr &I) const;
+
+ /// \brief Mark \p UniVal as a value that is always uniform.
+ void addUniformOverride(const ValueTy UniVal);
+ void addUniformOverride(const MachineInstr &I);
+
+ /// \brief Mark \p DivVal as a value that is always divergent.
+ void markDivergent(const ValueTy DivVal);
+ void markDivergent(const MachineInstr &I);
+
+ /// \brief Propagate divergence to all instructions in the region.
+ /// Divergence is seeded by calls to \p markDivergent.
+ void compute();
+
+ /// \brief Whether any value was marked or analyzed to be divergent.
+ bool hasDetectedDivergence() const { return !DivergentValues.empty(); }
+
+ /// \brief Whether \p Val will always return a uniform value regardless of its
+ /// operands
+ bool isAlwaysUniform(const ValueTy Val) const;
+
+ /// \brief Whether \p Val is a divergent value
+ bool isDivergent(const ValueTy Val) const;
+ bool isDivergent(const MachineInstr &I) const;
+
+ void print(llvm::raw_ostream &OS, const Module_ *) const;
+
+private:
+ bool isDivergent(const llvm::MachineOperand &MO) const;
+ bool updateTerminator(const MachineInstr &Term) const;
+ bool updatePHINode(const PHINode_ &Phi) const;
+ bool updateVCndMask(const MachineInstr &VCndMask) const;
+ bool isBitUniform(const MachineInstr &I,
+ llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
+ bool isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO,
+ llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
+
+ /// \brief Computes whether \p Inst is divergent based on the
+ /// divergence of its operands.
+ ///
+ /// \returns Whether \p Inst is divergent.
+ ///
+ /// This should only be called for non-phi, non-terminator instructions.
+ bool updateNormalInstruction(const MachineInstr &Inst) const;
+
+  /// \brief Mark users of loop live-out values as divergent.
+ ///
+ /// \param LoopHeader the header of the divergent loop.
+ ///
+ /// Marks all users of live-out values of the loop headed by \p LoopHeader
+ /// as divergent and puts them on the worklist.
+ void taintLoopLiveOuts(const MachineBasicBlock &LoopHeader);
+
+ /// \brief Push all users of \p Val (in the region) to the worklist
+ void pushUsers(const ValueTy I);
+ void pushUsers(const MachineInstr &I);
+
+ void pushInstruction(const MachineInstr &I);
+ /// \brief Push all phi nodes in @block to the worklist
+ void pushPHINodes(const MachineBasicBlock &Block);
+
+ /// \brief Mark \p Block as join divergent
+ ///
+ /// A block is join divergent if two threads may reach it from different
+ /// incoming blocks at the same time.
+ void markBlockJoinDivergent(const MachineBasicBlock &Block) {
+ DivergentJoinBlocks.insert(&Block);
+ }
+
+ /// \brief Whether \p Val is divergent when read in \p ObservingBlock.
+ bool isTemporalDivergent(const MachineBasicBlock &ObservingBlock,
+ const ValueTy Val,
+ const MachineBasicBlock &incomingBlock) const; // AMDGPU change
+
+ /// \brief Whether \p Block is join divergent
+ ///
+ /// (see markBlockJoinDivergent).
+ bool isJoinDivergent(const MachineBasicBlock &Block) const {
+ return DivergentJoinBlocks.find(&Block) != DivergentJoinBlocks.end();
+ }
+
+ /// \brief Propagate control-induced divergence to users (phi nodes and
+ /// instructions).
+ //
+ // \param JoinBlock is a divergent loop exit or join point of two disjoint
+ // paths.
+ // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop.
+ bool propagateJoinDivergence(const MachineBasicBlock &JoinBlock,
+ const MachineLoop *TermLoop);
+
+ /// \brief Propagate induced value divergence due to control divergence in \p
+ /// Term.
+ void propagateBranchDivergence(const MachineInstr &Term);
+
+ /// \brief Propagate induced value divergence due to exec update caused by \p
+ /// SaveExec.
+ void propagateExecControlFlowDivergence(const MachineInstr &SaveExec);
+
+ /// \brief Propagate divergent caused by a divergent loop exit.
+ ///
+ /// \param ExitingLoop is a divergent loop.
+ void propagateLoopDivergence(const MachineLoop &ExitingLoop);
+
+private:
+ const llvm::MachineFunction &F;
+ const llvm::MachineRegisterInfo &MRI;
+ const llvm::SIRegisterInfo *SIRI;
+ const llvm::SIInstrInfo *SIII;
+  // If RegionLoop != nullptr, the analysis is only performed within
+  // \p RegionLoop; otherwise the whole function is analyzed.
+ const MachineLoop *RegionLoop;
+
+ const MachineDominatorTree &DT;
+ const MachinePostDominatorTree &PDT;
+ const MachineLoopInfo &LI;
+
+ // Recognized divergent loops
+ llvm::DenseSet<const MachineLoop *> DivergentLoops;
+
+ // AMDGPU change begin
+  // Saves block pairs that are divergent-disjoint.
+ // A
+ // | \
+ // | \
+ // B C
+ // | /
+ // D
+ // When A is divergent branch, B and C are divergent join at D.
+ // Then DivergentJoinMap[B].count(C) > 0 and
+ // DivergentJoinMap[C].count(B) > 0.
+ DivergentJoinMapTy &DivergentJoinMap;
+ // AMDGPU change end
+
+ // The SDA links divergent branches to divergent control-flow joins.
+ SyncDependenceAnalysis &SDA;
+
+ // Use simplified code path for LCSSA form.
+ bool IsLCSSAForm;
+
+ // Set of known-uniform values.
+ llvm::DenseSet<unsigned> UniformOverrides;
+ llvm::DenseSet<const llvm::MachineInstr*> UniformOverridesInsts;
+
+ // Blocks with joining divergent control from different predecessors.
+ llvm::DenseSet<const MachineBasicBlock *> DivergentJoinBlocks;
+
+ // Detected/marked divergent values.
+ llvm::DenseSet<unsigned> DivergentValues;
+ llvm::DenseSet<const llvm::MachineInstr*> DivergentInsts;
+
+  // MIR change for EXEC control flow.
+  // Map from MBB to the exec region it belongs to.
+  // An exec region begins with
+  //   S_MOV_B64 sreg, exec
+  // and ends with
+  //   S_MOV_B64 exec, sreg
+  // Inside the region, exec may be updated to implement control flow via exec.
+ struct ExecRegion {
+ const llvm::MachineInstr *begin;
+ const llvm::MachineInstr *end;
+ std::vector<const llvm::MachineBasicBlock*> blocks;
+ bool bPropagated = false;
+ ExecRegion(const llvm::MachineInstr *b,
+ const llvm::MachineInstr *e)
+ : begin(b), end(e), bPropagated(false) {}
+ };
+ llvm::DenseMap<const llvm::MachineBasicBlock *, ExecRegion *> ExecRegionMap;
+
+ // Internal worklist for divergence propagation.
+ std::vector<const llvm::MachineInstr*> Worklist;
+};
+
+/// \brief Divergence analysis frontend for GPU kernels.
+class MirGPUDivergenceAnalysis {
+ // AMDGPU change begin
+  // Saves block pairs that are divergent-disjoint.
+ // A
+ // | \
+ // | \
+ // B C
+ // | /
+ // D
+ // When A is divergent branch, B and C are divergent join at D.
+ // Then DivergentJoinMap[B].count(C) > 0 and
+ // DivergentJoinMap[C].count(B) > 0.
+ DivergentJoinMapTy DivergentJoinMap;
+ // AMDGPU change end
+ SyncDependenceAnalysis SDA;
+ DivergenceAnalysis DA;
+
+public:
+ /// Runs the divergence analysis on @F, a GPU kernel
+ MirGPUDivergenceAnalysis(llvm::MachineFunction &F, const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI);
+
+ /// Whether any divergence was detected.
+ bool hasDivergence() const { return DA.hasDetectedDivergence(); }
+
+ /// The GPU kernel this analysis result is for
+ const llvm::MachineFunction &getFunction() const { return DA.getFunction(); }
+
+ /// Whether \p I is divergent.
+ bool isDivergent(const MachineInstr *I) const;
+
+ /// Whether \p I is uniform/non-divergent
+ bool isUniform(const MachineInstr *I) const { return !isDivergent(I); }
+
+ /// Print all divergent values in the kernel.
+ void print(llvm::raw_ostream &OS, const Module_ *) const;
+};
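+
+// Example usage (an illustrative sketch; MF/MDT/MPDT/MLI are assumed to come
+// from the corresponding machine analyses):
+//   MirGPUDivergenceAnalysis DA(MF, MDT, MPDT, MLI);
+//   for (const MachineBasicBlock &MBB : MF)
+//     for (const MachineInstr &MI : MBB)
+//       if (DA.isDivergent(&MI)) {
+//         // Handle a divergent instruction.
+//       }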
+
+} // namespace llvm
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
new file mode 100644
index 000000000000000..7213f7b4b11b4c6
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
@@ -0,0 +1,511 @@
+//===- AMDGPUMirSyncDependenceAnalysis.cpp - Divergent Branch Dependence -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is based on Analysis/SyncDependenceAnalysis.cpp, just changing
+// BasicBlock to MachineBasicBlock.
+// This file implements an algorithm that returns for a divergent branch
+// the set of basic blocks whose phi nodes become divergent due to divergent
+// control. These are the blocks that are reachable by two disjoint paths from
+// the branch or loop exits that have a reaching path that is disjoint from a
+// path to the loop latch.
+//
+// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model
+// control-induced divergence in phi nodes.
+//
+// -- Summary --
+// The SyncDependenceAnalysis lazily computes sync dependences [3].
+// The analysis evaluates the disjoint path criterion [2] by a reduction
+// to SSA construction. The SSA construction algorithm is implemented as
+// a simple data-flow analysis [1].
+//
+// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy
+// [2] "Efficiently Computing Static Single Assignment Form
+// and the Control Dependence Graph", TOPLAS '91,
+// Cytron, Ferrante, Rosen, Wegman and Zadeck
+// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack
+// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira
+//
+// -- Sync dependence --
+// Sync dependence [4] characterizes the control flow aspect of the
+// propagation of branch divergence. For example,
+//
+// %cond = icmp slt i32 %tid, 10
+// br i1 %cond, label %then, label %else
+// then:
+// br label %merge
+// else:
+// br label %merge
+// merge:
+// %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// -- Reduction to SSA construction --
+// There are two disjoint paths from A to X, if a certain variant of SSA
+// construction places a phi node in X under the following set-up scheme [2].
+//
+// This variant of SSA construction ignores incoming undef values.
+// That is paths from the entry without a definition do not result in
+// phi nodes.
+//
+// entry
+// / \
+// A \
+// / \ Y
+// B C /
+// \ / \ /
+// D E
+// \ /
+// F
+// Assume that A contains a divergent branch. We are interested
+// in the set of all blocks where each block is reachable from A
+// via two disjoint paths. This would be the set {D, F} in this
+// case.
+// To generally reduce this query to SSA construction we introduce
+// a virtual variable x and assign to x different values in each
+// successor block of A.
+// entry
+// / \
+// A \
+// / \ Y
+// x = 0 x = 1 /
+// \ / \ /
+// D E
+// \ /
+// F
+// Our flavor of SSA construction for x will construct the following
+// entry
+// / \
+// A \
+// / \ Y
+// x0 = 0 x1 = 1 /
+// \ / \ /
+// x2=phi E
+// \ /
+// x3=phi
+// The blocks D and F contain phi nodes and are thus each reachable
+// by two disjoint paths from A.
+//
+// -- Remarks --
+// In case of loop exits we need to check the disjoint path criterion for loops
+// [2]. To this end, we check whether the definition of x differs between the
+// loop exit and the loop header (_after_ SSA construction).
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "AMDGPUMirSyncDependenceAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+#include <stack>
+#include <unordered_set>
+
+#define DEBUG_TYPE "sync-dependence"
+
+namespace llvm {
+
+ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet;
+
+SyncDependenceAnalysis::SyncDependenceAnalysis(const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI,
+ // AMDGPU change begin.
+ DivergentJoinMapTy &JoinMap
+ // AMDGPU change end.
+ )
+ : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI),
+ // AMDGPU change begin.
+ DivergentJoinMap(JoinMap)
+ // AMDGPU change end.
+{
+}
+
+SyncDependenceAnalysis::~SyncDependenceAnalysis() {}
+
+using FunctionRPOT = ReversePostOrderTraversal<const MachineFunction *>;
+
+// divergence propagator for reducible CFGs
+struct DivergencePropagator {
+ const FunctionRPOT &FuncRPOT;
+ const MachineDominatorTree &DT;
+ const MachinePostDominatorTree &PDT;
+ const MachineLoopInfo &LI;
+
+ // identified join points
+ std::unique_ptr<ConstBlockSet> JoinBlocks;
+
+ // reached loop exits (by a path disjoint to a path to the loop header)
+ SmallPtrSet<const MachineBasicBlock *, 4> ReachedLoopExits;
+
+ // if DefMap[B] == C then C is the dominating definition at block B
+ // if DefMap[B] ~ undef then we haven't seen B yet
+ // if DefMap[B] == B then B is a join point of disjoint paths from X or B is
+ // an immediate successor of X (initial value).
+ using DefiningBlockMap = std::map<const MachineBasicBlock *, const MachineBasicBlock *>;
+ DefiningBlockMap DefMap;
+
+ // all blocks with pending visits
+ std::unordered_set<const MachineBasicBlock *> PendingUpdates;
+
+ DivergencePropagator(const FunctionRPOT &FuncRPOT, const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI)
+ : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI),
+ JoinBlocks(new ConstBlockSet) {}
+
+ // set the definition at @block and mark @block as pending for a visit
+ void addPending(const MachineBasicBlock &Block, const MachineBasicBlock &DefBlock) {
+ bool WasAdded = DefMap.emplace(&Block, &DefBlock).second;
+ if (WasAdded)
+ PendingUpdates.insert(&Block);
+ }
+
+ void printDefs(raw_ostream &Out) {
+ Out << "Propagator::DefMap {\n";
+ for (const auto *Block : FuncRPOT) {
+ auto It = DefMap.find(Block);
+ Out << Block->getName() << " : ";
+ if (It == DefMap.end()) {
+ Out << "\n";
+ } else {
+ const auto *DefBlock = It->second;
+ Out << (DefBlock ? DefBlock->getName() : "<null>") << "\n";
+ }
+ }
+ Out << "}\n";
+ }
+
+ // process @succBlock with reaching definition @defBlock
+ // the original divergent branch was in @parentLoop (if any)
+ void visitSuccessor(const MachineBasicBlock &SuccBlock, const MachineLoop *ParentLoop,
+ const MachineBasicBlock &DefBlock) {
+
+ // @succBlock is a loop exit
+ if (ParentLoop && !ParentLoop->contains(&SuccBlock)) {
+ DefMap.emplace(&SuccBlock, &DefBlock);
+ ReachedLoopExits.insert(&SuccBlock);
+ return;
+ }
+
+ // first reaching def?
+ auto ItLastDef = DefMap.find(&SuccBlock);
+ if (ItLastDef == DefMap.end()) {
+ addPending(SuccBlock, DefBlock);
+ return;
+ }
+
+ // a join of at least two definitions
+ if (ItLastDef->second != &DefBlock) {
+ // do we know this join already?
+ if (!JoinBlocks->insert(&SuccBlock).second)
+ return;
+
+ // update the definition
+ addPending(SuccBlock, SuccBlock);
+ }
+ }
+
+ // find all blocks reachable by two disjoint paths from @rootTerm.
+ // This method works for both divergent terminators and loops with
+ // divergent exits.
+ // @rootBlock is either the block containing the branch or the header of the
+ // divergent loop.
+ // @nodeSuccessors is the set of successors of the node (MachineLoop or Terminator)
+ // headed by @rootBlock.
+ // @parentLoop is the parent loop of the MachineLoop or the loop that contains the
+ // Terminator.
+ template <typename SuccessorIterable>
+ std::unique_ptr<ConstBlockSet>
+ computeJoinPoints(const MachineBasicBlock &RootBlock,
+ SuccessorIterable NodeSuccessors, const MachineLoop *ParentLoop, const MachineBasicBlock * PdBoundBlock) {
+ assert(JoinBlocks);
+
+ // bootstrap with branch targets
+ for (const auto *SuccBlock : NodeSuccessors) {
+ DefMap.emplace(SuccBlock, SuccBlock);
+
+ if (ParentLoop && !ParentLoop->contains(SuccBlock)) {
+ // immediate loop exit from node.
+ ReachedLoopExits.insert(SuccBlock);
+ continue;
+ } else {
+ // regular successor
+ PendingUpdates.insert(SuccBlock);
+ }
+ }
+
+ auto ItBeginRPO = FuncRPOT.begin();
+
+ // skip until term (TODO RPOT won't let us start at @term directly)
+ for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {}
+
+ auto ItEndRPO = FuncRPOT.end();
+ assert(ItBeginRPO != ItEndRPO);
+
+ // propagate definitions at the immediate successors of the node in RPO
+ auto ItBlockRPO = ItBeginRPO;
+ while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) {
+ const auto *Block = *ItBlockRPO;
+
+ // skip @block if not pending update
+ auto ItPending = PendingUpdates.find(Block);
+ if (ItPending == PendingUpdates.end())
+ continue;
+ PendingUpdates.erase(ItPending);
+
+ // propagate definition at @block to its successors
+ auto ItDef = DefMap.find(Block);
+ const auto *DefBlock = ItDef->second;
+ assert(DefBlock);
+
+ auto *BlockLoop = LI.getLoopFor(Block);
+ if (ParentLoop &&
+ (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) {
+        // if the successor is the header of a nested loop pretend it's a
+ // single node with the loop's exits as successors
+ SmallVector<MachineBasicBlock *, 4> BlockLoopExits;
+ BlockLoop->getExitBlocks(BlockLoopExits);
+ for (const auto *BlockLoopExit : BlockLoopExits) {
+ visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock);
+ }
+
+ } else {
+ // the successors are either on the same loop level or loop exits
+ for (const auto *SuccBlock : Block->successors()) {
+ visitSuccessor(*SuccBlock, ParentLoop, *DefBlock);
+ }
+ }
+ }
+
+ // We need to know the definition at the parent loop header to decide
+ // whether the definition at the header is different from the definition at
+    // the loop exits, which would indicate a divergent loop exit.
+ //
+ // A // loop header
+ // |
+ // B // nested loop header
+ // |
+ // C -> X (exit from B loop) -..-> (A latch)
+ // |
+ // D -> back to B (B latch)
+ // |
+ // proper exit from both loops
+ //
+ // D post-dominates B as it is the only proper exit from the "A loop".
+ // If C has a divergent branch, propagation will therefore stop at D.
+ // That implies that B will never receive a definition.
+    // But that definition can only be the same as at D (D itself in this case)
+ // because all paths to anywhere have to pass through D.
+ //
+ const MachineBasicBlock *ParentLoopHeader =
+ ParentLoop ? ParentLoop->getHeader() : nullptr;
+ if (ParentLoop && ParentLoop->contains(PdBoundBlock)) {
+ DefMap[ParentLoopHeader] = DefMap[PdBoundBlock];
+ }
+
+ // analyze reached loop exits
+ if (!ReachedLoopExits.empty()) {
+ assert(ParentLoop);
+ const auto *HeaderDefBlock = DefMap[ParentLoopHeader];
+ LLVM_DEBUG(printDefs(dbgs()));
+
+ // AMDGPU CHANGE: Allow null HeaderDefBlock
+ // Because of the way they walk the blocks (a reverse post order traversal
+ // stopping at the immediate post dominator) it is possible that
+ // they will reach a loop exit, but not the loop header.
+ //
+ // We conservatively mark the exit blocks as divergent join points
+ // in this case.
+ //
+ // Problem CFG is below:
+ //
+ // +--> A
+ // | / \
+ // | B C
+ // | | / |
+ // +--L P
+ //
+ // In this cfg, C is the RootBlock and P is C's post-dominator.
+ // It will only visit L and P and then stop because it hits the
+ // post dominator. Most loops do not hit this case because the
+ // loop exiting block (C) will branch directly back to the loop
+ // header.
+ //
+ if (HeaderDefBlock)
+ {
+ for (const auto *ExitBlock : ReachedLoopExits) {
+ auto ItExitDef = DefMap.find(ExitBlock);
+ assert((ItExitDef != DefMap.end()) &&
+ "no reaching def at reachable loop exit");
+ if (ItExitDef->second != HeaderDefBlock) {
+ JoinBlocks->insert(ExitBlock);
+ }
+ }
+ }
+ else
+ {
+ for (const auto *ExitBlock : ReachedLoopExits)
+ {
+ JoinBlocks->insert(ExitBlock);
+ }
+ }
+ }
+
+ return std::move(JoinBlocks);
+ }
+};
+
+// AMDGPU change begin.
+// For each join block caused by the divergent RootBlock, any two of its
+// predecessors that are in DefMap (or are the RootBlock itself) join divergent
+// values at that block, so record them as divergent-join pairs.
+static void updateJoinMap(
+ const MachineBasicBlock *RootBlock,
+ DenseMap<const MachineBasicBlock *, SmallPtrSet<const MachineBasicBlock *, 4>> &JoinMap,
+ DivergencePropagator::DefiningBlockMap &DefMap, ConstBlockSet &JoinBlocks) {
+ for (const MachineBasicBlock *JoinBB : JoinBlocks) {
+    // Mark a divergent join for every pair of preds that are in DefMap.
+ for (auto predIt = JoinBB->pred_begin(); predIt != JoinBB->pred_end();
+ predIt++) {
+ auto predIt2 = predIt;
+ const MachineBasicBlock *pred = *predIt;
+ if (DefMap.count(pred) == 0 && pred != RootBlock)
+ continue;
+
+ for (predIt2++; predIt2 != JoinBB->pred_end(); predIt2++) {
+ const MachineBasicBlock *pred2 = *predIt2;
+ if (DefMap.count(pred2) == 0 && pred2 != RootBlock)
+ continue;
+
+ JoinMap[pred].insert(pred2);
+ JoinMap[pred2].insert(pred);
+ LLVM_DEBUG(dbgs() << "joint_bb0: " << pred->getName()
+ << " joint_bb1: " << pred2->getName() << "\n";);
+ }
+ }
+ }
+}
+// AMDGPU change end.
+
+const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) {
+ using LoopExitVec = SmallVector<MachineBasicBlock *, 4>;
+ LoopExitVec LoopExits;
+ MachineLoop.getExitBlocks(LoopExits);
+ if (LoopExits.size() < 1) {
+ return EmptyBlockSet;
+ }
+
+ // already available in cache?
+ auto ItCached = CachedLoopExitJoins.find(&MachineLoop);
+ if (ItCached != CachedLoopExitJoins.end()) {
+ return *ItCached->second;
+ }
+
+  // don't propagate beyond the immediate post dominator of the loop
+ const auto *PdNode = PDT.getNode(const_cast<MachineBasicBlock *>(MachineLoop.getHeader()));
+ const auto *IpdNode = PdNode->getIDom();
+ const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+ while (PdBoundBlock && MachineLoop.contains(PdBoundBlock)) {
+ IpdNode = IpdNode->getIDom();
+ PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+ }
+
+ // compute all join points
+ DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
+ auto JoinBlocks = Propagator.computeJoinPoints<const LoopExitVec &>(
+ *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), PdBoundBlock);
+
+ // AMDGPU change begin.
+ // Save divergent join pairs.
+ updateJoinMap(MachineLoop.getHeader(), DivergentJoinMap, Propagator.DefMap,
+ *JoinBlocks.get());
+ // AMDGPU change end.
+
+ auto ItInserted = CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks));
+ assert(ItInserted.second);
+ return *ItInserted.first->second;
+}
+
+const ConstBlockSet &
+SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) {
+ // trivial case
+ if (Term.getParent()->succ_size() < 1) {
+ return EmptyBlockSet;
+ }
+
+ // already available in cache?
+ auto ItCached = CachedBranchJoins.find(&Term);
+ if (ItCached != CachedBranchJoins.end())
+ return *ItCached->second;
+
+  // don't propagate beyond the immediate post dominator of the branch
+ const auto *PdNode = PDT.getNode(const_cast<MachineBasicBlock *>(Term.getParent()));
+ const auto *IpdNode = PdNode->getIDom();
+ const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+
+
+ // compute all join points
+ DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
+ const auto &TermBlock = *Term.getParent();
+
+ // AMDGPU CHANGE
+ // Make sure the post-dominator is outside the loop for the loop header.
+ // Otherwise, we may not find all the join blocks in the loop
+ // because the search stops too early. Some join points can be reached
+ // after the post-dominator!
+ //
+ // Problem CFG is below:
+ //
+ // +--> A
+ // | / \
+ // | B P
+ // | | / |
+ // +--L X
+ //
+ // In this cfg, A is the loop header and P is A's post-dominator.
+  // The algorithm to mark join points does a Reverse Post Order walk
+ // from A and stops when it reaches the post dominator. It would not
+ // mark the phi node in L as divergent even when A had a divergent branch.
+ // The fix we made was to make the join point search continue all the way
+ // to the loops post dominator (which is X in this example).
+ //
+ // NOTE: They already made this change for the loop case above, but for
+ // a different bug apparently. See SyncDependenceAnalysis::join_blocks(MachineLoop&)
+ //
+ const MachineLoop *MachineLoop = LI.getLoopFor(&TermBlock);
+ if (MachineLoop && (MachineLoop->getHeader() == &TermBlock))
+ {
+ while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) {
+ IpdNode = IpdNode->getIDom();
+ PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+ }
+ }
+
+ auto JoinBlocks = Propagator.computeJoinPoints(
+ TermBlock, Term.getParent()->successors(), MachineLoop, PdBoundBlock);
+
+ // AMDGPU change begin.
+ // Save divergent join pairs.
+ updateJoinMap(&TermBlock, DivergentJoinMap, Propagator.DefMap,
+ *JoinBlocks.get());
+ // AMDGPU change end.
+
+ auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks));
+ assert(ItInserted.second);
+ return *ItInserted.first->second;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
new file mode 100644
index 000000000000000..a52bcc7bc9e7c51
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
@@ -0,0 +1,98 @@
+//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file defines the SyncDependenceAnalysis class, which computes for
+// every divergent branch the set of phi nodes that the branch will make
+// divergent.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include <memory>
+#include <map>
+
+namespace llvm {
+class MachineBasicBlock;
+class MachineDominatorTree;
+class MachineLoop;
+class MachinePostDominatorTree;
+class MachineLoopInfo;
+class MachineFunction;
+class MachineInstr;
+
+using DivergentJoinMapTy =
+ llvm::DenseMap<const llvm::MachineBasicBlock *,
+ llvm::SmallPtrSet<const llvm::MachineBasicBlock *, 4>>;
+
+using ConstBlockSet = llvm::SmallPtrSet<const MachineBasicBlock *, 4>;
+
+/// \brief Relates points of divergent control to join points in
+/// reducible CFGs.
+///
+/// This analysis relates points of divergent control to points of converging
+/// divergent control. The analysis requires all loops to be reducible.
+class SyncDependenceAnalysis {
+ void visitSuccessor(const MachineBasicBlock &succBlock, const MachineLoop *termLoop,
+ const MachineBasicBlock *defBlock);
+
+public:
+ bool inRegion(const MachineBasicBlock &BB) const;
+
+ ~SyncDependenceAnalysis();
+ SyncDependenceAnalysis(const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI,
+ // AMDGPU change begin
+ DivergentJoinMapTy &JoinMap
+ // AMDGPU change end
+ );
+
+ /// \brief Computes divergent join points and loop exits caused by branch
+ /// divergence in \p Term.
+ ///
+ /// The set of blocks which are reachable by disjoint paths from \p Term.
+  /// The set also contains loop exits if there are two disjoint paths:
+ /// one from \p Term to the loop exit and another from \p Term to the loop
+ /// header. Those exit blocks are added to the returned set.
+ /// If L is the parent loop of \p Term and an exit of L is in the returned
+ /// set then L is a divergent loop.
+ const ConstBlockSet &join_blocks(const MachineInstr &Term);
+
+ /// \brief Computes divergent join points and loop exits (in the surrounding
+  /// loop) caused by the divergent loop exits of \p MachineLoop.
+ ///
+ /// The set of blocks which are reachable by disjoint paths from the
+ /// loop exits of \p MachineLoop.
+ /// This treats the loop as a single node in \p MachineLoop's parent loop.
+ /// The returned set has the same properties as for join_blocks(TermInst&).
+ const ConstBlockSet &join_blocks(const MachineLoop &MachineLoop);
+
+private:
+ static ConstBlockSet EmptyBlockSet;
+
+ llvm::ReversePostOrderTraversal<const llvm::MachineFunction *> FuncRPOT;
+ const MachineDominatorTree &DT;
+ const MachinePostDominatorTree &PDT;
+ const MachineLoopInfo &LI;
+ // AMDGPU change begin.
+ DivergentJoinMapTy &DivergentJoinMap;
+ // AMDGPU change end.
+ std::map<const MachineLoop *, std::unique_ptr<ConstBlockSet>> CachedLoopExitJoins;
+ std::map<const MachineInstr *, std::unique_ptr<ConstBlockSet>>
+ CachedBranchJoins;
+};
+
+} // namespace llvm
+
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
new file mode 100644
index 000000000000000..648df7f724617f4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -0,0 +1,188 @@
+//===-- AMDGPUOccupancyAndLatencyHelper.cpp - Occupancy/latency helpers --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+namespace llvm {
+
+// Compute a score used to compare schedule results.
+float SchedScore::computeScore() const {
+  // With occupancy 1, mixed ALU work cannot hide latency.
+ unsigned MixHidenAlu = Alu - MixAlu;
+ if (Occupancy == 1)
+ MixHidenAlu = 0;
+ return ((float)MemLatency - (float)MixHidenAlu) / (float)Occupancy -
+ LatencyHide;
+}
+float SchedScore::computeScore2() const {
+ float cycles = 0;
+ cycles = (MixAlu * Occupancy + MemLatency);
+ cycles /= Occupancy;
+ return cycles;
+}
+
+void SchedScore::sum(const SchedScore &s, unsigned loopDepth) {
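+  // Weight blocks inside loops by 3^loopDepth as a rough trip-count estimate.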
+ unsigned loopCount = loopDepth > 0 ? std::pow(3, loopDepth) : 1;
+ LatencyHide += loopCount * s.LatencyHide;
+ MemLatency += loopCount * s.MemLatency;
+ MixAlu += loopCount * s.MixAlu;
+ Alu += loopCount * s.Alu;
+ Lds += loopCount * s.Lds;
+ SgprSpill |= s.SgprSpill;
+}
+bool SchedScore::isBetter(const SchedScore &s) const {
+ float score = computeScore();
+ float newScore = s.computeScore();
+ bool spillBetter = !SgprSpill && s.SgprSpill;
+  return spillBetter || newScore >= score;
+}
+// Does more occupancy give more perf?
+bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const {
+ unsigned gain = latencyGain(TargetOccupancy, ExtraOcc);
+ // 10% is good enough.
+  return (10 * gain) >= Alu;
+}
+
+unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const {
+ unsigned latency = MemLatency;
+  return (latency / TgtOcc) - (latency / (TgtOcc + ExtraOcc));
+}
+
+// AMDGPULatencyTracker
+AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST)
+ : SIII(ST.getInstrInfo()), ItinerayData(ST.getInstrItineraryData()) {}
+
+void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
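+  // Track outstanding loads by destination reg in LatencyMIs. When a later
+  // instruction reads one of those regs, the cycles it has to wait are counted
+  // as MemLatency; cycles that elapse without a wait count as LatencyHide.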
+ if (MI.isDebugInstr()) return;
+ int latency = SIII->getInstrLatency(ItinerayData, MI);
+ // If inside latency hide.
+ if (!LatencyMIs.empty()) {
+ bool bWaitCnt = false;
+ for (auto &MO : MI.operands()) {
+ if (MO.isReg()) {
+ unsigned reg = MO.getReg();
+ auto it = LatencyMIs.find(reg);
+ if (it != LatencyMIs.end()) {
+ bWaitCnt = true;
+          // If MI uses the mem result, raise its latency to the mem latency.
+ int cycle = it->second;
+ if (cycle > latency)
+ latency = cycle;
+ }
+ }
+ }
+ // Update latency for each mem latency inst.
+ for (auto it = LatencyMIs.begin(); it != LatencyMIs.end();) {
+ auto prev = it;
+ auto l = (it++);
+ int cycle = l->second;
+ if (cycle <= latency) {
+          // The load finishes within this instruction's latency.
+          // Remove the reg.
+ LatencyMIs.erase(prev);
+ if (bWaitCnt && cycle == latency) {
+ score.MemLatency += cycle;
+ // Only count memLatency once, the rest is hide.
+ bWaitCnt = false;
+ } else {
+ // Hide cycle or count mem latency?
+ score.LatencyHide += cycle;
+ }
+ } else {
+ l->second -= latency;
+ // Hide latency.
+ score.LatencyHide += latency;
+ }
+ }
+
+ } else {
+ // TODO: check branch/lds?
+ // TODO: check prevVAlu?
+ auto getAluStatus = [](const MachineInstr &MI,
+ const llvm::SIInstrInfo *SIII) {
+ AluStatus status = AluStatus::Nothing;
+ if (SIII->isVALU(MI.getOpcode())) {
+ status = AluStatus::Vector;
+ } else if (SIII->isSALU(MI.getOpcode())) {
+ status = AluStatus::Scalar;
+ }
+ return status;
+ };
+ AluStatus status = getAluStatus(MI, SIII);
+
+ switch (prevStatus) {
+ case AluStatus::Nothing: {
+ score.Alu += latency;
+ score.MixAlu += latency;
+ prevStatus = status;
+ } break;
+ case AluStatus::Vector:
+ case AluStatus::Scalar: {
+ score.Alu += latency;
+ // Ignore mix alu.
+ if (prevStatus != status) {
+ prevStatus = AluStatus::Nothing;
+ } else {
+ score.MixAlu += latency;
+ }
+ } break;
+ }
+ }
+ // Update latency inst.
+ if (SIII->isHighLatencyInstruction(MI) && MI.mayLoad()) {
+ unsigned reg = MI.getOperand(0).getReg();
+ // TODO: get correct latency.
+ // SIII->getInstrLatency(ItinerayData, MI);
+    constexpr unsigned kHighLatency = 180;
+    LatencyMIs[reg] = kHighLatency;
+ } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) {
+ unsigned reg = MI.getOperand(0).getReg();
+ // TODO: get correct latency.
+ // SIII->getInstrLatency(ItinerayData, MI);
+    constexpr unsigned kLowLatency = 35;
+    LatencyMIs[reg] = kLowLatency;
+ }
+}
+
+SchedScore CollectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
+ const llvm::MachineLoopInfo *MLI) {
+ SchedScore totalScore;
+  for (MachineBasicBlock &MBB : MF) {
+ AMDGPULatencyTracker latencyTracker(ST);
+ for (auto &MI : MBB) {
+ latencyTracker.scan(MI);
+ }
+ unsigned loopDepth = 0;
+ if (MLI) {
+ loopDepth = MLI->getLoopDepth(&MBB);
+ }
+ totalScore.sum(latencyTracker.score, loopDepth);
+ }
+ return totalScore;
+}
+
+} // namespace llvm
+
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
new file mode 100644
index 000000000000000..f108bab24bd3907
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -0,0 +1,74 @@
+//===- AMDGPUOccupancyAndLatencyHelper.h - Occupancy/latency helpers -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+class MachineFunction;
+class GCNSubtarget;
+class InstrItineraryData;
+class MachineInstr;
+class SIInstrInfo;
+class MachineLoopInfo;
+
+struct SchedScore {
+ // Score for this Sched result.
+ unsigned Occupancy = 0;
+ bool SgprSpill = false;
+  unsigned LatencyHide = 0; // Memory latency cycles hidden by other work.
+  unsigned MemLatency = 0; // Memory latency cycles that could not be hidden.
+  // We want mem latency small and hidden latency big. Compare
+  // memLatency - hide * Occ; smaller is better.
+  unsigned MixAlu = 0; // VALU and SALU can run in parallel if Occ > 1.
+  unsigned Alu = 0; // Avoid s_alu sequences with count less than occupancy.
+  unsigned Lds = 0; // TODO: count lds.
+ SchedScore() {}
+
+  // Scores used to compare schedule results.
+ float computeScore() const;
+ float computeScore2() const;
+
+ void sum(const SchedScore &s, unsigned loopDepth=0);
+ bool isBetter(const SchedScore &s) const;
+ bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc=1) const;
+  // More latency can be hidden with ExtraOcc.
+ unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;
+};
+
+struct AMDGPULatencyTracker {
+ AMDGPULatencyTracker(const llvm::GCNSubtarget &ST);
+ const llvm::SIInstrInfo *SIII;
+ const llvm::InstrItineraryData *ItinerayData;
+ // Latency MI dst reg to cycle map.
+ llvm::DenseMap<unsigned, int> LatencyMIs;
+ SchedScore score;
+ // Low latency MI not wait.
+ unsigned hideLatency = 0;
+ unsigned memLatency = 0;
+  // For simplicity, only consider the mixture as one VALU and one SALU.
+  // No grouping for now.
+ unsigned prevSAlu = 0;
+ unsigned prevVAlu = 0;
+ enum class AluStatus {
+ Nothing,
+ Vector,
+ Scalar,
+ } prevStatus = AluStatus::Nothing;
+ void scan(const llvm::MachineInstr &MI);
+};
+
+SchedScore CollectLatency(llvm::MachineFunction &MF,
+ const llvm::GCNSubtarget &ST,
+ const llvm::MachineLoopInfo *MLI = nullptr);
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
new file mode 100644
index 000000000000000..a0f2a5d4dc121b5
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
@@ -0,0 +1,1790 @@
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/ADT/IntEqClasses.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Support/GraphWriter.h"
+
+#include "llvm/Support/Debug.h"
+
+#include "GCNRegPressure.h"
+#include "AMDGPUMIRUtils.h"
+#include "AMDGPUSubExpDag.h"
+#include <unordered_set>
+
+#define DEBUG_TYPE "xb-sub-exp-dag"
+using namespace llvm;
+
+namespace llvm {
+
+// Expression Dag.
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void SubExp::dump(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) const {
+ dbgs() << "\nSubExp:\n";
+ dbgs() << "input regs:\n";
+ for (auto &input : inputLive) {
+ pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs());
+ dbgs() << "\n";
+ }
+ dbgs() << "output regs:\n";
+ for (auto &output : outputLive) {
+ pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs());
+ dbgs() << "\n";
+ }
+
+ for (MachineInstr *MI : SUnits) {
+ MI->dump();
+ }
+ dbgs() << "End of SubExp\n";
+}
+#endif
+
+bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo* SIRI) const
+{
+ for (const MachineInstr *MI : SUnits)
+ {
+ if (MI->modifiesRegister(Reg, SIRI))
+ {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) {
+ sMaxSize = std::max(sInputSize, sOutputSize);
+ vMaxSize = std::max(vInputSize, vOutputSize);
+
+ DenseMap<unsigned, LaneBitmask> LiveRegs;
+ GCNRegPressure CurPressure;
+
+ // Add output to pressure.
+ for (MachineInstr *MI : BottomRoots) {
+ for (MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (!MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual())
+ continue;
+ LaneBitmask mask = getRegMask(MO, MRI);
+ auto it = LiveRegs.find(Reg);
+ if (it != LiveRegs.end()) {
+ LiveRegs[Reg] = mask | it->second;
+ } else {
+ LiveRegs[Reg] = mask;
+ }
+ }
+ }
+
+ for (auto it : LiveRegs) {
+ LaneBitmask emptyMask;
+ CurPressure.inc(it.first, emptyMask, it.second, MRI);
+ }
+
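+  // Walk the instructions bottom-up: a def clears its lanes from the live set,
+  // a use adds them, and CurPressure tracks the max SGPR/VGPR pressure seen.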
+ for (auto it = SUnits.rbegin(); it != SUnits.rend(); it++) {
+ MachineInstr *MI = *it;
+ auto *ST = &MI->getMF()->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
+ for (MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual()) {
+ if (Reg == AMDGPU::SCC)
+ bTouchSCC = true;
+ continue;
+ }
+
+ LaneBitmask LiveMask = getRegMask(MO, MRI);
+ LaneBitmask PrevMask;
+ auto liveIt = LiveRegs.find(Reg);
+ if (liveIt != LiveRegs.end()) {
+ PrevMask = liveIt->second;
+ }
+
+ if (MO.isDef()) {
+ LiveMask = PrevMask & (~(LiveMask));
+ } else {
+ LiveMask = PrevMask | LiveMask;
+ }
+
+ CurPressure.inc(Reg, PrevMask, LiveMask, MRI);
+ LiveRegs[Reg] = LiveMask;
+ }
+
+ unsigned sSize = CurPressure.getSGPRNum();
+ unsigned vSize = CurPressure.getVGPRNum(ST->hasGFX90AInsts());
+ if (sSize > sMaxSize)
+ sMaxSize = sSize;
+ if (vSize > vMaxSize)
+ vMaxSize = vSize;
+ }
+}
+
+bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const {
+ if (bMultiDefOutput)
+ return false;
+ if (bHasTerminatorInst)
+ return false;
+ if (bUseIncomingReg)
+ return false;
+
+ // Input should be single def.
+ for (unsigned Reg : TopRegs) {
+ if (!MRI.hasOneDef(Reg) && !llvm::IsSub0Sub1SingleDef(Reg, MRI))
+ return false;
+ }
+ return true;
+}
+
+ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, const bool bJoinInput)
+ : MRI(MRI), SIRI(SIRI), SIII(SIII), bJoinInputToSubExp(bJoinInput) {}
+
+template <typename T>
+void ExpDag::initNodes(const LiveSet &InputLiveReg, T &insts) {
+ unsigned NodeSize = InputLiveReg.size() + insts.size();
+ SUnits.reserve(NodeSize);
+
+ for (MachineInstr *MI : insts) {
+ if (MI->isDebugInstr())
+ continue;
+ SUnits.emplace_back(MI, SUnits.size());
+ SUnit *SU = &SUnits.back();
+ SUnitMIMap[SU] = MI;
+ MISUnitMap[MI] = SU;
+ }
+
+ for (auto it : InputLiveReg) {
+ unsigned Reg = it.first;
+ SUnits.emplace_back();
+ SUnit *SU = &SUnits.back();
+ SU->NodeNum = SUnits.size() - 1;
+ SUnitInputMap[SU] = Reg;
+ InputSUnitMap[Reg] = SU;
+ }
+}
+
+template void ExpDag::initNodes<DenseSet<MachineInstr *>>(
+ const LiveSet &InputLiveReg, DenseSet<MachineInstr *> &instRange);
+
+template void ExpDag::initNodes<std::vector<MachineInstr *>>(
+ const LiveSet &InputLiveReg, std::vector<MachineInstr *> &instRange);
+
+template <typename T>
+void ExpDag::build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg,
+ T &insts) {
+ initNodes(InputLiveReg, insts);
+ addDataDep(SIRI);
+ addCtrlDep();
+ buildSubExp(InputLiveReg, OutputLiveReg, SIRI, SIII);
+}
+
+template void
+ExpDag::build<DenseSet<MachineInstr *>>(const LiveSet &InputLiveReg,
+ const LiveSet &OutputLiveReg,
+ DenseSet<MachineInstr *> &instRange);
+template void ExpDag::build<std::vector<MachineInstr *>>(const LiveSet &InputLiveReg,
+ const LiveSet &OutputLiveReg,
+ std::vector<MachineInstr *> &instRange);
+
+void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ IntEqClasses SubtreeClasses(SUnits.size());
+ std::vector<unsigned> passThruInputs;
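+  // Partition the DAG into connected components with a union-find: each SU is
+  // joined with its preds and succs, and every resulting class becomes one
+  // SubExp. Isolated nodes (no preds and no succs) are pass-through inputs.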
+ for (SUnit &SU : SUnits) {
+ if (SU.NumPredsLeft == 0 && SU.NumSuccsLeft == 0) {
+ passThruInputs.emplace_back(SU.NodeNum);
+ continue;
+ }
+ if (!bJoinInputToSubExp && !SU.isInstr())
+ continue;
+ // Join prev.
+ for (SDep &PreDep : SU.Preds) {
+ SUnit *PreSU = PreDep.getSUnit();
+ if (!bJoinInputToSubExp && !PreSU->isInstr())
+ continue;
+ SubtreeClasses.join(SU.NodeNum, PreSU->NodeNum);
+ }
+ // Join succ.
+ for (SDep &SucDep : SU.Succs) {
+ SUnit *SucSU = SucDep.getSUnit();
+ SubtreeClasses.join(SU.NodeNum, SucSU->NodeNum);
+ }
+ }
+ SubtreeClasses.compress();
+
+ unsigned NumSubExps = SubtreeClasses.getNumClasses();
+  // Don't count passThruInputs as subExps since they are exps with only 1 SU.
+  // SubExpIndexMap packs SubIdx into the updated NumSubExps range.
+ NumSubExps -= passThruInputs.size();
+ SubExps.resize(NumSubExps);
+ DenseMap<unsigned, unsigned> SubExpIndexMap;
+
+ // Add SU to sub exp.
+ for (SUnit &SU : SUnits) {
+ if (SU.NumPredsLeft == 0 && SU.NumSuccsLeft == 0) {
+ continue;
+ }
+ unsigned SubIdx = SubtreeClasses[SU.NodeNum];
+ unsigned OriginSubIdx = SubIdx;
+ // Pack subidx.
+ if (SubExpIndexMap.count(SubIdx) == 0) {
+ unsigned count = SubExpIndexMap.size();
+ SubExpIndexMap.insert(std::make_pair(SubIdx, count));
+ }
+ SubIdx = SubExpIndexMap[SubIdx];
+    // Use NodeQueueId as SubIdx. We don't schedule on ExpDag.
+ SU.NodeQueueId = SubIdx;
+
+ SubExp &Exp = SubExps[SubIdx];
+ auto it = SUnitInputMap.find(&SU);
+ if (it != SUnitInputMap.end()) {
+ // Input.
+ unsigned Reg = it->second;
+ Exp.TopRegs.insert(Reg);
+ } else {
+ MachineInstr *MI = SU.getInstr();
+ MachineBasicBlock *MBB = MI->getParent();
+ Exp.FromBB = MBB;
+ for (MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (!MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (MRI.getLiveInPhysReg(Reg) || MRI.getLiveInVirtReg(Reg)) {
+ Exp.bUseIncomingReg = true;
+ }
+ }
+
+ Exp.SUnits.emplace_back(MI);
+ if (SU.NumSuccsLeft == 0) {
+ Exp.BottomRoots.insert(MI);
+ if (MI->isTerminator())
+ Exp.bHasTerminatorInst = true;
+ }
+ if (MI->isNotDuplicable())
+ Exp.bNotSafeToCopy = true;
+      // Skip scalar mem access since there is no scalar store.
+ if (MI->mayLoadOrStore() && !SIII->isSMRD(*MI)) {
+ Exp.bHasMemInst = true;
+ }
+ // Add bottom regs.
+ for (MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (!MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ // physical reg is not in live reg.
+ if (!Reg.isVirtual())
+ continue;
+ if (SU.NumSuccsLeft) {
+          // For an SU whose def is used in the current blk,
+          // check if it is also used in other blks or subExps.
+ bool bUsedInOtherBlk = false;
+ for (auto &UserMI : MRI.use_nodbg_instructions(Reg)) {
+ if (UserMI.getParent() != MBB) {
+ bUsedInOtherBlk = true;
+ break;
+ }
+ auto suIt = MISUnitMap.find(&UserMI);
+            // When UserMI is not in the dag, treat it as another block.
+ if (suIt == MISUnitMap.end()) {
+ bUsedInOtherBlk = true;
+ break;
+ }
+ SUnit *UseSU = suIt->second;
+            // UserMI should always be in the same subExp.
+ unsigned UseSubIdx = SubtreeClasses[UseSU->NodeNum];
+ if (UseSubIdx != OriginSubIdx) {
+              // When the reg has multiple defs, a user and a def may be in different subExps.
+ if (MRI.getUniqueVRegDef(Reg))
+ llvm::report_fatal_error("user and def in different subExp");
+ break;
+ }
+ }
+ if (!bUsedInOtherBlk)
+ continue;
+ }
+ Exp.BottomRegs.insert(Reg);
+ if (!MRI.getUniqueVRegDef(Reg)) {
+ Exp.bMultiDefOutput = true;
+ }
+ }
+ }
+ }
+  // Calc input/output regs for each SubExp.
+  // Take the lane masks from the block live-in and live-out sets,
+  // since only the regs themselves are known at this point.
+ for (SubExp &Exp : SubExps) {
+ for (unsigned Reg : Exp.TopRegs) {
+ auto it = StartLiveReg.find(Reg);
+ assert(it != StartLiveReg.end() &&
+ "cannot find input reg in block start live");
+ Exp.inputLive[Reg] |= it->second;
+ }
+
+ for (unsigned Reg : Exp.BottomRegs) {
+ auto it = EndLiveReg.find(Reg);
+ if (it == EndLiveReg.end()) {
+ //"cannot find output reg in block end live");
+ // Bottom reg is killed inside current block, did not get out of the
+ // block.
+ // Or the bottom reg is not treat as output in this dag, not save to
+ // outputLive which will affect profit count.
+ continue;
+ }
+ Exp.outputLive[Reg] |= it->second;
+ }
+
+ CollectLiveSetPressure(Exp.inputLive, MRI, SIRI, Exp.vInputSize,
+ Exp.sInputSize);
+ CollectLiveSetPressure(Exp.outputLive, MRI, SIRI, Exp.vOutputSize,
+ Exp.sOutputSize);
+ }
+}
+
+void ExpDag::addDataDep(const SIRegisterInfo *SIRI) {
+ DenseMap<unsigned, MachineInstr *> curDefMI;
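+  // curDefMI records the latest def of each reg seen so far; a use links to
+  // that def's SU, or to the block-input SU when no def exists in the DAG.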
+
+ for (SUnit &SU : SUnits) {
+ if (!SU.isInstr())
+ continue;
+ MachineInstr *MI = SU.getInstr();
+
+ // Link use to the def.
+ for (MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ continue;
+
+ Register Reg = MO.getReg();
+ SUnit *DefSU = nullptr;
+
+ auto curDefIt = curDefMI.find(Reg);
+ // Check def inst first.
+ if (curDefIt != curDefMI.end()) {
+ MachineInstr *curDef = curDefIt->second;
+ DefSU = MISUnitMap[curDef];
+ } else {
+ // physical reg is not in live reg.
+ if (!Reg.isVirtual())
+ continue;
+ if (MO.isUndef())
+ continue;
+        // Is it OK for a debug instr MO to have no def?
+ if (MI->isDebugInstr())
+ continue;
+ // Should be an input.
+ assert(InputSUnitMap.count(Reg) > 0 && "cannot find def");
+ DefSU = InputSUnitMap[Reg];
+ }
+ SU.addPred(SDep(DefSU, SDep::Data, Reg));
+ }
+
+ // Add def to curDefMI;
+ for (MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (!MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+
+ // For case like:
+ // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32
+ // %808.sub1:sgpr_64 = S_MOV_B32 0
+ // When partially write, link MI to previous def.
+ if (MO.getSubReg() != 0) {
+ SUnit *DefSU = nullptr;
+ auto curDefIt = curDefMI.find(Reg);
+ // Check def inst first.
+ if (curDefIt != curDefMI.end()) {
+ MachineInstr *curDef = curDefIt->second;
+ DefSU = MISUnitMap[curDef];
+ // Add link between different defs.
+ SU.addPred(SDep(DefSU, SDep::Data, Reg));
+ }
+ }
+
+ curDefMI[Reg] = MI;
+ }
+ }
+}
+
+void ExpDag::addCtrlDep() {
+ // TODO: add depend for memory, barrier.
+}
+
+BlockExpDag::BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI,
+ const llvm::SIInstrInfo *SIII)
+ : ExpDag(MRI, SIRI, SIII, /*bJoinInput*/ true), LIS(LIS), MBB(B) {}
+
+void BlockExpDag::build() {
+ auto *SlotIndexes = LIS->getSlotIndexes();
+ const auto StartIdx = SlotIndexes->getMBBStartIdx(MBB);
+ const auto StartLiveReg = llvm::getLiveRegs(StartIdx, *LIS, MRI);
+
+ const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB);
+ const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI);
+
+ std::vector<MachineInstr *> insts;
+ for (MachineInstr &MI : *MBB) {
+ insts.emplace_back(&MI);
+ }
+
+ ExpDag::build(StartLiveReg, EndLiveReg, insts);
+}
+
+void BlockExpDag::buildWithPressure() {
+ auto *SlotIndexes = LIS->getSlotIndexes();
+ const auto StartIdx = SlotIndexes->getMBBStartIdx(MBB);
+ const auto StartLiveReg = llvm::getLiveRegs(StartIdx, *LIS, MRI);
+
+ const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB);
+ const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI);
+
+ std::vector<MachineInstr *> insts;
+ for (MachineInstr &MI : *MBB) {
+ insts.emplace_back(&MI);
+ }
+
+ ExpDag::build(StartLiveReg, EndLiveReg, insts);
+ // Build pressure.
+ buildPressure(StartLiveReg, EndLiveReg);
+}
+
+void BlockExpDag::buildAvail(
+ const LiveSet &passThruSet,
+ DenseMap<SUnit *, LiveSet> &DagAvailRegMap) {
+ DenseSet<SUnit *> Processed;
+
+ DenseSet<SUnit *> WorkList;
+ MachineInstr &BeginMI = MBB->instr_front();
+
+  // Calc available regs for each node; live is avail & sum(input of succs).
+  // If a reg is available from a node, then a successor node can use it from
+  // this node. For dag live, a single pred's output doesn't need to cover all
+  // the inputs a node needs; it is OK as long as all pred outputs together do.
+ for (SUnit &SU : SUnits) {
+ if (SU.NumPredsLeft == 0) {
+ GCNDownwardRPTracker RP(*LIS);
+ RP.reset(BeginMI, &passThruSet);
+ MachineInstr *MI = SU.getInstr();
+ if (MI) {
+ RP.reset(*MI, &passThruSet);
+ RP.advance();
+ }
+ DagAvailRegMap[&SU] = RP.getLiveRegs();
+
+ // Add succ to work list.
+ for (auto &Succ : SU.Succs) {
+ SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->NumPredsLeft > 0)
+ SuccSU->NumPredsLeft--;
+ WorkList.insert(SuccSU);
+ }
+ }
+ }
+ while (!WorkList.empty()) {
+ bool bUpdated = false;
+ SmallVector<SUnit *, 4> ReadyNodes;
+ for (SUnit *SU : WorkList) {
+ if (SU->NumPredsLeft > 0)
+ continue;
+ ReadyNodes.emplace_back(SU);
+ // Ready, move it to Processed.
+ Processed.insert(SU);
+ bUpdated = true;
+ // Only update 1 node once.
+      // Order of schedule here should not affect pressure.
+ break;
+ }
+
+ for (SUnit *SU : ReadyNodes) {
+ // Remove SU from worklist.
+ WorkList.erase(SU);
+
+ MachineInstr *MI = SU->getInstr();
+ // Calc pressure based on pred nodes.
+ GCNRPTracker::LiveRegSet dagLive;
+ for (auto &Pred : SU->Preds) {
+ SUnit *PredSU = Pred.getSUnit();
+ GCNRPTracker::LiveRegSet PredLive = DagAvailRegMap[PredSU];
+
+ GCNDownwardRPTracker RP(*LIS);
+ RP.reset(BeginMI, &PredLive);
+ if (MI) {
+ RP.reset(*MI, &PredLive);
+ // Update PredLive based on MI.
+ RP.advance();
+ }
+ llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs());
+ }
+ DagAvailRegMap[SU] = dagLive;
+
+ // Add succ to work list.
+ for (auto &Succ : SU->Succs) {
+ SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->NumPredsLeft > 0)
+ SuccSU->NumPredsLeft--;
+ WorkList.insert(SuccSU);
+ }
+ }
+
+ // Skip dead loop
+ if (ReadyNodes.empty()) {
+ printf("dead loop when build dag pressure");
+ break;
+ }
+ }
+
+ assert(WorkList.empty() && "schedule failed for available reg");
+}
+
+void BlockExpDag::buildPressure(const LiveSet &StartLiveReg,
+ const LiveSet &EndLiveReg) {
+ if (MBB->empty())
+ return;
+ DenseMap<SUnit *, GCNRPTracker::LiveRegSet> DagAvailRegMap;
+ GCNRPTracker::LiveRegSet passThruSet;
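+  // passThruSet holds the lanes live across the whole block: registers live at
+  // both the block start and the block end, restricted to their shared lanes.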
+ for (auto Reg : StartLiveReg) {
+ unsigned reg = Reg.first;
+ auto EndReg = EndLiveReg.find(reg);
+ if (EndReg == EndLiveReg.end())
+ continue;
+
+ LaneBitmask mask = Reg.second;
+ LaneBitmask endMask = EndReg->second;
+ mask &= endMask;
+ if (mask.getAsInteger() == 0)
+ continue;
+ passThruSet[reg] = mask;
+ }
+
+  // Build the avail set for each node.
+ buildAvail(passThruSet, DagAvailRegMap);
+
+  // Calc available regs for each node; live is avail & sum(input of succs).
+  // If a reg is available from a node, then a successor node can use it from
+  // this node. For dag live, a single pred's output doesn't need to cover all
+  // the inputs a node needs; it is OK as long as all pred outputs together do.
+ DenseSet<SUnit *> Processed;
+
+ DenseSet<SUnit *> WorkList;
+ MachineInstr &BeginMI = MBB->instr_front();
+
+ for (SUnit &SU : SUnits) {
+ if (SU.NumSuccsLeft == 0) {
+ // Calc pressure based on pass thru.
+ // Using pass thru as base because output of current SU should not
+ // affect other output SUs.
+ GCNUpwardRPTracker RP(*LIS);
+ RP.reset(BeginMI, &passThruSet, /*After*/true);
+ MachineInstr *MI = SU.getInstr();
+ if (MI) {
+ RP.reset(*MI, &passThruSet, /*After*/true);
+ RP.recede(*MI);
+ }
+ DagPressureMap[&SU] = RP.getLiveRegs();
+ // Add pred to work list.
+ for (auto &Pred : SU.Preds) {
+ SUnit *PredSU = Pred.getSUnit();
+ PredSU->NumSuccsLeft--;
+ WorkList.insert(PredSU);
+ }
+ }
+ }
+
+ while (!WorkList.empty()) {
+ bool bUpdated = false;
+ SmallVector<SUnit *, 4> ReadyNodes;
+ for (SUnit *SU : WorkList) {
+ if (SU->NumSuccsLeft > 0)
+ continue;
+ ReadyNodes.emplace_back(SU);
+ // Ready, move it to Processed.
+ Processed.insert(SU);
+ bUpdated = true;
+ // Only update 1 node once.
+      // Order of schedule here should not affect pressure.
+ break;
+ }
+
+ for (SUnit *SU : ReadyNodes) {
+ // Remove SU from worklist.
+ WorkList.erase(SU);
+
+ MachineInstr *MI = SU->getInstr();
+ // Calc pressure based on succ nodes.
+ GCNRPTracker::LiveRegSet dagLive;
+ for (auto &Succ : SU->Succs) {
+ SUnit *SuccSU = Succ.getSUnit();
+ GCNRPTracker::LiveRegSet SuccLive = DagPressureMap[SuccSU];
+
+ GCNUpwardRPTracker RP(*LIS);
+ RP.reset(BeginMI, &SuccLive, /*After*/true);
+ if (MI) {
+ RP.reset(*MI, &SuccLive, /*After*/true);
+ // Update SuccLive based on MI.
+ RP.recede(*MI);
+ }
+ llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs());
+ }
+      // Remove live regs which are not avail in SU.
+ GCNRPTracker::LiveRegSet availLive = DagAvailRegMap[SU];
+ llvm::andLiveRegSet(dagLive, availLive);
+ DagPressureMap[SU] = dagLive;
+
+ // Add pred to work list.
+ for (auto &Pred : SU->Preds) {
+ SUnit *PredSU = Pred.getSUnit();
+ PredSU->NumSuccsLeft--;
+ WorkList.insert(PredSU);
+ }
+ }
+
+ // Skip dead loop
+ if (ReadyNodes.empty()) {
+ printf("dead loop when build dag pressure");
+ break;
+ }
+ }
+}
+
+// dump functions.
+
+std::string ExpDag::getGraphNodeLabel(const SUnit *SU) const {
+ std::string s;
+ raw_string_ostream oss(s);
+ auto it = SUnitInputMap.find(SU);
+ if (it != SUnitInputMap.end()) {
+ oss << "<input:" << llvm::printReg(it->second) << ">";
+ } else {
+ SU->getInstr()->print(oss, /*SkipOpers=*/true);
+ }
+
+ return oss.str();
+}
+
+/// Return the label.
+std::string ExpDag::getDAGName() const {
+ return "dag.exp";
+}
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void ExpDag::viewGraph(const Twine &Name, const Twine &Title) const {
+#if 0 // TODO: Re-enable this
+ // This code is only for debugging!
+#ifndef NDEBUG
+ ViewGraph(const_cast<ExpDag *>(this), Name, false, Title);
+#else
+ errs() << "BlockExpDag::viewGraph is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+#endif
+}
+
+void ExpDag::dump() {
+ viewGraph(getDAGName(), "Exp Dag Graph for " + getDAGName());
+}
+
+} // namespace llvm
+
+// Expression Dag dump.
+namespace llvm {
+
+static DenseSet<const SUnit *> ViewNodes;
+
+template <>
+struct DOTGraphTraits<llvm::ExpDag *> : public DefaultDOTGraphTraits {
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const llvm::ExpDag *G) {
+ return "ExpDag graph";
+ }
+
+ static bool renderGraphFromBottomUp() { return true; }
+
+ static bool isNodeHidden(const SUnit *Node) {
+ if (ViewNodes.empty())
+ return false;
+
+ return ViewNodes.count(Node) == 0;
+ }
+
+ static std::string getNodeIdentifierLabel(const SUnit *Node,
+ const llvm::ExpDag *Graph) {
+ std::string R;
+ raw_string_ostream OS(R);
+ OS << static_cast<const void *>(Node);
+ return R;
+ }
+
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ static std::string getEdgeAttributes(const SUnit *Node, SUnitIterator EI,
+ const llvm::ExpDag *Graph) {
+ if (EI.isArtificialDep())
+ return "color=cyan,style=dashed";
+ if (EI.isCtrlDep())
+ return "color=blue,style=dashed";
+ return "";
+ }
+
+ static std::string getNodeLabel(const SUnit *SU, const llvm::ExpDag *Graph) {
+ std::string Str;
+ raw_string_ostream SS(Str);
+ SS << "SU:" << SU->NodeNum;
+ return SS.str();
+ }
+ static std::string getNodeDescription(const SUnit *SU, const llvm::ExpDag *G) {
+ return G->getGraphNodeLabel(SU);
+ }
+ static std::string getNodeAttributes(const SUnit *N,
+ const llvm::ExpDag *Graph) {
+ std::string Str("shape=Mrecord");
+
+ Str += ",style=filled,fillcolor=\"#";
+ // Use NodeQueueId as SubIdx for ExpDag.
+ Str += DOT::getColorString(N->NodeQueueId);
+ Str += '"';
+
+ return Str;
+ }
+
+ static void addCustomGraphFeatures(llvm::ExpDag *G,
+ GraphWriter<llvm::ExpDag *> &GW) {
+ return G->addCustomGraphFeatures(GW);
+ }
+};
+
+template <> struct GraphTraits<llvm::ExpDag *> : public GraphTraits<SUnit *> {
+ using nodes_iterator = pointer_iterator<std::vector<SUnit>::iterator>;
+ static nodes_iterator nodes_begin(llvm::ExpDag *G) {
+ return nodes_iterator(G->SUnits.begin());
+ }
+ static nodes_iterator nodes_end(llvm::ExpDag *G) {
+ return nodes_iterator(G->SUnits.end());
+ }
+};
+
+} // namespace llvm
+
+namespace llvm {
+void getRegBound(llvm::MachineBasicBlock *MBB,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ llvm::LiveIntervals *LIS, unsigned &MaxVGPR,
+ unsigned &MaxSGPR) {
+ // TODO: calc real reg bound.
+ MaxVGPR = AMDGPU::VGPR255 - AMDGPU::VGPR0;
+ MaxSGPR = AMDGPU::SGPR104 - AMDGPU::SGPR0;
+
+ const auto &EndSlot = LIS->getMBBEndIdx(MBB);
+ const GCNRPTracker::LiveRegSet outputLive =
+ llvm::getLiveRegs(EndSlot, *LIS, MRI);
+
+ auto* ST = &MBB->getParent()->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
+ if (MBB->empty()) {
+ GCNRegPressure MaxPressure = getRegPressure(MRI, outputLive);
+ MaxSGPR = MaxPressure.getSGPRNum();
+ MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts());
+ return;
+ }
+
+ BlockExpDag dag(MBB, LIS, MRI, SIRI, SIII);
+ dag.build();
+
+ std::vector<SUnit> &SUnits = dag.SUnits;
+ // Remove input nodes.
+ for (SUnit &SU : SUnits) {
+ if (!SU.isInstr())
+ continue;
+ std::vector<SDep> inputDeps;
+ for (SDep &Dep : SU.Preds) {
+ SUnit *Pred = Dep.getSUnit();
+ if (Pred->isInstr())
+ continue;
+ inputDeps.emplace_back(Dep);
+ }
+ for (SDep &Dep : inputDeps) {
+ SU.removePred(Dep);
+ }
+ }
+
+ unsigned inputSize = dag.InputSUnitMap.size();
+ unsigned instNodeSize = SUnits.size() - inputSize;
+ SUnits.erase(SUnits.begin() + instNodeSize, SUnits.end());
+
+ std::vector<llvm::SUnit *> BotRoots;
+ for (SUnit &SU : SUnits) {
+ if (SU.NumSuccsLeft == 0)
+ BotRoots.emplace_back(&SU);
+ }
+
+ auto SchedResult = hrbSched(SUnits, BotRoots, MRI, SIRI);
+
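+  // Replay the produced schedule bottom-up with an upward RP tracker to
+  // measure the maximum register pressure it would reach.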
+ GCNUpwardRPTracker RPTracker(*LIS);
+ RPTracker.reset(MBB->front(), &outputLive, /*After*/true);
+ for (auto it = SchedResult.rbegin(); it != SchedResult.rend(); it++) {
+ const SUnit *SU = *it;
+ if (!SU->isInstr())
+ continue;
+ MachineInstr *MI = SU->getInstr();
+ RPTracker.recede(*MI);
+ }
+
+ GCNRegPressure MaxPressure = RPTracker.getMaxPressureAndReset();
+ MaxSGPR = MaxPressure.getSGPRNum();
+ MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts());
+}
+} // namespace llvm
+
+// HRB
+namespace {
+
+std::vector<SUnit *> buildWorkList(std::vector<llvm::SUnit> &SUnits) {
+ std::vector<SUnit *> resultList;
+ resultList.reserve(SUnits.size());
+ for (SUnit &SU : SUnits) {
+ resultList.emplace_back(&SU);
+ }
+ return resultList;
+}
+
+void sortByHeight(std::vector<SUnit *> &workList) {
+ std::sort(workList.begin(), workList.end(),
+ [](const SUnit *a, const SUnit *b) {
+ // Lowest height first.
+ if (a->getHeight() < b->getHeight())
+ return true;
+              // If heights are equal, bigger NodeNum first.
+ if (a->getHeight() == b->getHeight())
+ return a->NodeNum > b->NodeNum;
+ return false;
+ });
+}
+
+void sortByInChain(std::vector<SUnit *> &workList, DenseSet<SUnit *> &Chained) {
+  // Put in-chain nodes at the end.
+ std::sort(workList.begin(), workList.end(),
+ [&Chained](const SUnit *a, const SUnit *b) {
+ return Chained.count(a) < Chained.count(b);
+ });
+}
+
+const TargetRegisterClass *getRegClass(SUnit *SU,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) {
+ if (!SU->isInstr())
+ return nullptr;
+ MachineInstr *MI = SU->getInstr();
+ if (MI->getNumDefs() == 0)
+ return nullptr;
+
+  // For an MI with more than one dst, always use the first dst.
+ MachineOperand *MO = MI->defs().begin();
+ if (!MO->isReg())
+ return nullptr;
+ unsigned Reg = MO->getReg();
+ return SIRI->getRegClassForReg(MRI, Reg);
+}
+
+unsigned getVGPRSize(const TargetRegisterClass *RC,
+ const SIRegisterInfo *SIRI) {
+ if (!RC)
+ return 0;
+ if (SIRI->isSGPRClass(RC))
+ return 0;
+ return RC->getLaneMask().getNumLanes();
+}
+unsigned getSGPRSize(const TargetRegisterClass *RC,
+ const SIRegisterInfo *SIRI) {
+ if (!RC)
+ return 0;
+ if (!SIRI->isSGPRClass(RC))
+ return 0;
+ return RC->getLaneMask().getNumLanes();
+}
+
+void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet<SUnit *, 2> &backNodes,
+ unsigned NodeNum,
+ SmallDenseSet<SUnit *, 4> &visitedNodes) {
+ if (visitedNodes.count(SU))
+ return;
+ visitedNodes.insert(SU);
+
+ for (SDep &Dep : SU->Succs) {
+ if (Dep.isWeak())
+ continue;
+ if (Dep.getLatency() > 0)
+ continue;
+
+    SUnit *Succ = Dep.getSUnit();
+    // if (Succ->NodeNum >= NodeNum)
+    //   continue;
+
+ backNodes.insert(Succ);
+ collectSameHeightBackNodes(Succ, backNodes, NodeNum, visitedNodes);
+ }
+}
+
+} // namespace
+
+namespace llvm {
+
+void HRB::Lineage::addNode(llvm::SUnit *SU) { Nodes.emplace_back(SU); }
+unsigned HRB::Lineage::getSize() const {
+ return RC ? RC->getLaneMask().getNumLanes() : 0;
+}
+unsigned HRB::Lineage::length() const { return Nodes.size(); }
+
+SUnit *HRB::Lineage::getHead() const { return Nodes.front(); }
+SUnit *HRB::Lineage::getTail() const { return Nodes.back(); }
+
+void HRB::buildLinear(std::vector<llvm::SUnit> &SUnits) {
+  // Build the work list from all SUnits.
+ std::vector<SUnit *> workList = buildWorkList(SUnits);
+ IntEqClasses EqClasses(SUnits.size());
+
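+  // Greedily peel off lineages: repeatedly take the highest remaining node and
+  // grow a chain of data-dependent successors (a Lineage) from it.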
+ while (!workList.empty()) {
+ sortByHeight(workList);
+ // Highest SU.
+ SUnit *SU = workList.back();
+ workList.pop_back();
+ if (!SU->isInstr())
+ continue;
+ if (ChainedNodes.count(SU) > 0)
+ continue;
+ bRecomputeHeight = false;
+ Lineage lineage = buildChain(SU, SUnits);
+
+ // Remove chained nodes from worklist.
+ sortByInChain(workList, ChainedNodes);
+ while (!workList.empty()) {
+ SUnit *back = workList.back();
+ if (ChainedNodes.count(back))
+ workList.pop_back();
+ else
+ break;
+ }
+
+ Lineages.emplace_back(lineage);
+
+ if (bRecomputeHeight) {
+ // Update height from tail.
+ SUnit *tail = lineage.Nodes.back();
+ tail->setDepthDirty();
+ tail->getHeight();
+ }
+ }
+
+ DenseSet<SUnit *> tailSet;
+ for (Lineage &L : Lineages) {
+ if (L.Nodes.size() < 2)
+ continue;
+ auto it = L.Nodes.rbegin();
+ it++;
+ SUnit *tail = L.Nodes.back();
+    // If already the tail of another lineage, start from the next node.
+ if (tailSet.count(tail) > 0) {
+ tail = *it;
+ it++;
+ } else {
+ tailSet.insert(tail);
+ }
+ for (; it != L.Nodes.rend(); it++) {
+ SUnit *SU = *it;
+ if (tail->NodeNum == -1)
+ continue;
+ EqClasses.join(SU->NodeNum, tail->NodeNum);
+ }
+ }
+
+ EqClasses.compress();
+ // TODO: assign sub class to node.
+ for (Lineage &L : Lineages) {
+ for (SUnit *SU : L.Nodes) {
+ if (SU->NodeNum == -1)
+ continue;
+ unsigned SubIdx = EqClasses[SU->NodeNum];
+ //// Pack subidx.
+ // if (EqClasses.count(SubIdx) == 0)
+ // EqClasses[SubIdx] = EqClasses.size();
+ SubIdx = EqClasses[SubIdx];
+      // Use NodeQueueId as SubIdx. We don't schedule on ExpDag.
+ SU->NodeQueueId = SubIdx;
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Chained Nodes:"; for (SUnit *SU
+ : ChainedNodes) {
+ dbgs() << " " << SU->NodeNum << "\n";
+ } for (int i = 0; i < Lineages.size(); i++) {
+ dbgs() << "Lineage" << i << ":";
+ Lineage &L = Lineages[i];
+ for (SUnit *SU : L.Nodes) {
+ dbgs() << " " << SU->NodeNum;
+ }
+ dbgs() << "\n";
+ });
+}
+
+SUnit *HRB::findHeir(SUnit *SU, std::vector<llvm::SUnit> &SUnits) {
+ std::vector<SUnit *> Candidates;
+ for (SDep &Dep : SU->Succs) {
+ // Only check data dep.
+ if (Dep.getKind() != SDep::Data)
+ continue;
+
+ SUnit *Succ = Dep.getSUnit();
+ Candidates.emplace_back(Succ);
+ }
+
+ if (Candidates.empty())
+ return nullptr;
+
+ if (Candidates.size() == 1)
+ return Candidates.front();
+
+ sortByHeight(Candidates);
+ // Lowest height.
+ SUnit *Heir = Candidates.front();
+ SmallVector<SUnit *, 2> SameHeightCandidate;
+ for (SUnit *SU : Candidates) {
+ if (Heir->getHeight() != SU->getHeight())
+ break;
+ SameHeightCandidate.emplace_back(SU);
+ }
+  // Make sure to choose the lowest dependence among SameHeightCandidate.
+ if (SameHeightCandidate.size() > 1) {
+ for (int i = 1; i < SameHeightCandidate.size(); i++) {
+ SUnit *SU = SameHeightCandidate[i];
+ // If Heir is pred of SU, use SU.
+ if (canReach(SU, Heir))
+ Heir = SU;
+ }
+ }
+
+ unsigned HeriHeight = Heir->getHeight();
+
+ // if lowest node is in ChainedNodes, try to find same height nodes?
+
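+  // Order every other data successor before the heir via an artificial edge so
+  // the heir stays last; bail out if that would create a cycle in the DAG.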
+ for (SDep &Dep : SU->Succs) {
+ // Only check data dep.
+ if (Dep.getKind() != SDep::Data)
+ continue;
+ SUnit *Succ = Dep.getSUnit();
+ if (Succ == Heir)
+ continue;
+ // Avoid cycle in DAG.
+ if (canReach(Heir, Succ))
+ return nullptr;
+ // Make sure Succ is before Heir.
+ Heir->addPred(SDep(Succ, SDep::Artificial));
+ updateReachForEdge(Succ, Heir, SUnits);
+ LLVM_DEBUG(dbgs() << "add edge from " << Succ->NodeNum << "("
+ << Succ->getHeight() << ") to " << Heir->NodeNum << "("
+ << HeriHeight << ")\n");
+    // Update height if needed.
+ unsigned Height = Succ->getHeight();
+ if (Height <= HeriHeight) {
+ bRecomputeHeight = true;
+ }
+ }
+ return Heir;
+}
+
+HRB::Lineage HRB::buildChain(SUnit *Node,
+ std::vector<llvm::SUnit> &SUnits) {
+ HRB::Lineage chain;
+ chain.addNode(Node);
+ ChainedNodes.insert(Node);
+ LLVM_DEBUG(dbgs() << "start chain " << Node->NodeNum << "("
+ << Node->getHeight() << ")\n");
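+  // Extend the chain one heir at a time; stop when no heir can be chosen or
+  // the chosen heir already belongs to another chain.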
+ while (Node->NumSuccsLeft > 0) {
+ SUnit *Heir = findHeir(Node, SUnits);
+ if (!Heir)
+ break;
+ chain.addNode(Heir);
+
+ LLVM_DEBUG(dbgs() << "add node to chain " << Heir->NodeNum << "\n");
+ if (ChainedNodes.count(Heir) > 0)
+ break;
+ ChainedNodes.insert(Heir);
+
+ Node = Heir;
+ }
+ // Find biggest vgpr RC for the chain.
+ // TODO: Build conflict and allocate on each edge of the chain.
+ const TargetRegisterClass *RC = nullptr;
+ unsigned maxRCSize = 0;
+ for (SUnit *SU : chain.Nodes) {
+ const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI);
+ unsigned RCSize = getVGPRSize(SuRC, SIRI);
+ if (RCSize > maxRCSize) {
+ maxRCSize = RCSize;
+ RC = SuRC;
+ }
+ }
+ if (!RC) {
+ // TODO: Find biggest sgpr RC.
+ unsigned maxRCSize = 0;
+ for (SUnit *SU : chain.Nodes) {
+ const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI);
+ unsigned RCSize = getSGPRSize(SuRC, SIRI);
+ if (RCSize > maxRCSize) {
+ maxRCSize = RCSize;
+ RC = SuRC;
+ }
+ }
+ }
+ chain.RC = RC;
+ return chain;
+}
+
+void HRB::buildConflict() {
+
+ for (unsigned i = 0; i < Lineages.size(); i++) {
+ Lineage &a = Lineages[i];
+ for (unsigned j = i + 1; j < Lineages.size(); j++) {
+ Lineage &b = Lineages[j];
+ if (isConflict(a, b)) {
+ Color.Conflicts[i].insert(j);
+ Color.Conflicts[j].insert(i);
+ LLVM_DEBUG(dbgs() << i << " conflict" << j << "\n");
+ }
+ }
+ // SelfConflict.
+ Color.Conflicts[i].insert(i);
+ }
+}
+
+bool HRB::canReach(llvm::SUnit *a, llvm::SUnit *b) {
+ auto it = ReachMap.find(a);
+  // If there is no reach info, conservatively treat it as reachable.
+ if (it == ReachMap.end())
+ return true;
+ DenseSet<SUnit *> &CurReach = it->second;
+ return CurReach.find(b) != CurReach.end();
+}
+
+void HRB::updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b,
+ std::vector<llvm::SUnit> &SUnits) {
+ DenseSet<SUnit *> &ReachA = ReachMap[a];
+ ReachA.insert(b);
+ DenseSet<SUnit *> &ReachB = ReachMap[b];
+ ReachA.insert(ReachB.begin(), ReachB.end());
+
+ for (SUnit &SU : SUnits) {
+ if (!canReach(&SU, a))
+ continue;
+
+ DenseSet<SUnit *> &CurReach = ReachMap[&SU];
+ CurReach.insert(ReachA.begin(), ReachA.end());
+ }
+}
+
+void HRB::buildReachRelation(ArrayRef<SUnit *> BotRoots) {
+  // Add a fake entry to do the post order traversal.
+  // SUnit traversal goes through Preds, so a reverse post order is needed.
+ SUnit FakeEntry;
+ SmallVector<SDep, 4> FakeDeps;
+ for (SUnit *Root : BotRoots) {
+ SDep Dep = SDep(Root, SDep::Artificial);
+ FakeEntry.addPred(Dep);
+ FakeDeps.emplace_back(Dep);
+ }
+
+ ReversePostOrderTraversal<SUnit *> RPOT(&FakeEntry);
+ for (SUnit *SU : RPOT) {
+ // Create Reach Set first.
+ ReachMap[SU].clear();
+ }
+ for (SUnit *SU : RPOT) {
+ DenseSet<SUnit *> &CurReach = ReachMap[SU];
+ // All Preds can reach SU and SU's reach.
+ for (SDep &Dep : SU->Preds) {
+      // Ignore weak dep.
+ if (Dep.isWeak())
+ continue;
+ DenseSet<SUnit *> &PrevReach = ReachMap[Dep.getSUnit()];
+ PrevReach.insert(SU);
+ PrevReach.insert(CurReach.begin(), CurReach.end());
+ }
+ assert(CurReach.count(SU) == 0 && "dead loop");
+ }
+ // Remove fake entry.
+ for (SDep &Dep : FakeDeps) {
+ FakeEntry.removePred(Dep);
+ }
+ ReachMap.erase(&FakeEntry);
+
+ LLVM_DEBUG(for (Lineage &L
+ : Lineages) {
+ for (SUnit *SU : L.Nodes) {
+ DenseSet<SUnit *> &CurReach = ReachMap[SU];
+ dbgs() << SU->NodeNum << " reach: ";
+ for (SUnit *R : CurReach) {
+ dbgs() << R->NodeNum << " ";
+ }
+ dbgs() << "\n";
+ }
+ });
+}
+
+bool HRB::isConflict(const Lineage &a, const Lineage &b) {
+  // Treat SGPR and VGPR lineages as conflicting to help group lineages when
+  // colors are shared. Keeping the conflict groups lineages together and
+  // avoids mixing colors across different sub expressions.
+ SUnit *head0 = a.getHead();
+ SUnit *tail0 = a.getTail();
+ SUnit *head1 = b.getHead();
+ SUnit *tail1 = b.getTail();
+ DenseSet<SUnit *> &Reach0 = ReachMap[head0];
+ DenseSet<SUnit *> &Reach1 = ReachMap[head1];
+ bool r01 = Reach0.count(tail1) != 0;
+ bool r10 = Reach1.count(tail0) != 0;
+ return r01 && r10;
+}
+bool HRB::canFuse(const Lineage &a, const Lineage &b) {
+ if (a.RC != b.RC) {
+    // A lineage without a register class does not conflict with other nodes.
+ if (!a.RC)
+ return false;
+ if (!b.RC)
+ return false;
+    // SGPR and VGPR lineages do not conflict.
+ if (SIRI->isSGPRClass(a.RC) != SIRI->isSGPRClass(b.RC))
+ return false;
+ }
+  // Can fuse if a.head reaches b.tail but b.head does not reach a.tail, or
+  // vice versa.
+ SUnit *head0 = a.getHead();
+ SUnit *tail0 = a.getTail();
+ SUnit *head1 = b.getHead();
+ SUnit *tail1 = b.getTail();
+ DenseSet<SUnit *> &Reach0 = ReachMap[head0];
+ DenseSet<SUnit *> &Reach1 = ReachMap[head1];
+ bool r01 = Reach0.count(tail1) != 0;
+ bool r10 = Reach1.count(tail0) != 0;
+ return r01 != r10;
+}
+
+bool HRB::tryFuse(Lineage &a, Lineage &b, std::vector<llvm::SUnit> &SUnits) {
+
+  // Can fuse if a.head reaches b.tail but b.head does not reach a.tail, or
+  // vice versa.
+ SUnit *head0 = a.getHead();
+ SUnit *tail0 = a.getTail();
+ SUnit *head1 = b.getHead();
+ SUnit *tail1 = b.getTail();
+ DenseSet<SUnit *> &Reach0 = ReachMap[head0];
+ DenseSet<SUnit *> &Reach1 = ReachMap[head1];
+ bool r01 = Reach0.count(tail1) != 0;
+ bool r10 = Reach1.count(tail0) != 0;
+ if (r01 == r10)
+ return false;
+ Lineage *newHead = &a;
+ Lineage *newTail = &b;
+ if (r01) {
+    // a reaches b, b cannot reach a.
+ // link a.tail->b.head.
+ newHead = &a;
+ newTail = &b;
+ } else {
+    // b reaches a, a cannot reach b.
+ // link b.tail->a.head.
+ newHead = &b;
+ newTail = &a;
+ }
+
+ // Merge reg class.
+ const TargetRegisterClass *RC0 = newHead->RC;
+ const TargetRegisterClass *RC1 = newTail->RC;
+ unsigned RC0Size = getVGPRSize(RC0, SIRI);
+ unsigned RC1Size = getVGPRSize(RC1, SIRI);
+ if (RC1Size > RC0Size)
+ newHead->RC = RC1;
+ // Merge chain.
+ SUnit *fuseTail = newHead->getTail();
+ SUnit *fuseHead = newTail->getHead();
+  assert(ReachMap[fuseHead].count(fuseTail) == 0 &&
+         "fuse head should not already reach fuse tail");
+ fuseHead->addPred(SDep(fuseTail, SDep::Artificial));
+ LLVM_DEBUG(dbgs() << "fuse " << fuseTail->NodeNum << "->" << fuseHead->NodeNum
+ << "\n");
+ // Update reach map.
+ updateReachForEdge(fuseTail, fuseHead, SUnits);
+ // Merge Nodes.
+ newHead->Nodes.append(newTail->Nodes.begin(), newTail->Nodes.end());
+ // Clear newTail.
+ newTail->Nodes.clear();
+ newTail->RC = nullptr;
+ return true;
+}
+
+void HRB::fusionLineages(std::vector<llvm::SUnit> &SUnits) {
+ if (Lineages.empty())
+ return;
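+  // Iterate to a fixed point: keep fusing pairs of lineages until a full pass
+  // makes no change.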
+ bool bUpdated = true;
+ while (bUpdated) {
+ bUpdated = false;
+ int size = Lineages.size();
+ for (int i = 0; i < size; i++) {
+ Lineage &a = Lineages[i];
+ if (a.length() == 0)
+ continue;
+
+ for (int j = i + 1; j < size; j++) {
+ Lineage &b = Lineages[j];
+ if (b.length() == 0)
+ continue;
+ if (tryFuse(a, b, SUnits)) {
+ bUpdated = true;
+ if (a.length() == 0)
+ break;
+ }
+ }
+ }
+ // Remove empty lineages.
+ std::sort(Lineages.begin(), Lineages.end(),
+ [](const Lineage &a, const Lineage &b) {
+ return a.length() > b.length();
+ });
+ while (Lineages.back().length() == 0) {
+ Lineages.pop_back();
+ }
+ }
+ // Set ID after fusion.
+ unsigned ID = 0;
+ for (Lineage &L : Lineages) {
+ L.ID = ID++;
+ }
+}
+
+unsigned HRB::colorLineages(std::vector<Lineage *> &lineages,
+ DenseMap<Lineage *, unsigned> &AllocMap,
+ const unsigned Limit) {
+  // Allocate longer lineages first. TODO: Should RC size be considered too?
+  std::sort(lineages.begin(), lineages.end(),
+            [](const Lineage *a, const Lineage *b) {
+              // Longer lineages get colors first.
+              return a->length() > b->length();
+            });
+
+ unsigned maxColor = 0;
+ const unsigned VGPR_LIMIT = 256 * 4;
+
+ for (Lineage *L : lineages) {
+ unsigned ID = L->ID;
+ auto &Conflict = Color.Conflicts[ID];
+ std::bitset<VGPR_LIMIT> colors;
+ for (unsigned j : Conflict) {
+ Lineage *C = &Lineages[j];
+ if (AllocMap.count(C) == 0)
+ continue;
+ unsigned c = AllocMap[C];
+ unsigned s = C->getSize();
+ for (unsigned i = 0; i < s; i++) {
+ unsigned pos = c + i;
+ colors.set(pos);
+ }
+ }
+
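+    // First-fit scan: slide a window of 'size' slots over the color bitset;
+    // when an occupied slot is found, restart the window just past it.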
+ unsigned color = Limit;
+ unsigned size = L->getSize();
+ for (unsigned i = 0; i < Limit - size;) {
+ unsigned oldI = i;
+ for (unsigned j = 0; j < size; j++) {
+ unsigned pos = i + size - 1 - j;
+ if (colors.test(pos)) {
+ i = pos + 1;
+ break;
+ }
+ }
+
+ if (i != oldI)
+ continue;
+ color = i;
+ break;
+ }
+
+ AllocMap[L] = color;
+ color += size;
+ if (color > maxColor)
+ maxColor = color;
+ }
+ return maxColor;
+}
+
+void HRB::ColorResult::colorSU(SUnit *SU, unsigned color) {
+ ColorMap[SU] = color;
+}
+
+unsigned HRB::ColorResult::getLineage(SUnit *SU) const {
+ return LineageMap.find(SU)->second;
+}
+
+bool HRB::ColorResult::isConflict(const SUnit *SU0, unsigned Lineage) const {
+ const unsigned L = LineageMap.find(SU0)->second;
+ const auto &Conflict = Conflicts.find(L)->second;
+ return Conflict.count(Lineage) > 0;
+}
+
+bool HRB::ColorResult::isHead(SUnit *SU) const { return HeadSet.count(SU); }
+bool HRB::ColorResult::isTail(SUnit *SU) const { return TailSet.count(SU); }
+
+const SUnit *HRB::ColorResult::getTail(SUnit *SU) const {
+ if (!isHead(SU))
+ return nullptr;
+ auto it = HeadTailMap.find(SU);
+ return it->second;
+}
+
+unsigned HRB::ColorResult::getColor(const llvm::SUnit *SU) const {
+ auto it = ColorMap.find(SU);
+ return it->second;
+}
+
+unsigned HRB::ColorResult::getSize(const llvm::SUnit *SU) const {
+ auto it = SizeMap.find(SU);
+ return it->second;
+}
+
+HRB::ColorResult &HRB::coloring() {
+ // Collect VGPR lineages.
+ std::vector<Lineage *> vgprLineages;
+ for (Lineage &L : Lineages) {
+ auto RC = L.RC;
+ if (!RC)
+ continue;
+ if (SIRI->isSGPRClass(RC))
+ continue;
+ vgprLineages.emplace_back(&L);
+ }
+
+ const unsigned VGPR_LIMIT = 256 * 4;
+ DenseMap<Lineage *, unsigned> VAllocMap;
+ const unsigned maxVGPR = colorLineages(vgprLineages, VAllocMap, VGPR_LIMIT);
+
+ // Collect SGPR lineages.
+ std::vector<Lineage *> sgprLineages;
+ for (Lineage &L : Lineages) {
+ auto RC = L.RC;
+ if (!RC)
+ continue;
+ if (!SIRI->isSGPRClass(RC))
+ continue;
+ sgprLineages.emplace_back(&L);
+ }
+
+ const unsigned SGPR_LIMIT = 104;
+ DenseMap<Lineage *, unsigned> SAllocMap;
+ const unsigned maxSGPR = colorLineages(sgprLineages, SAllocMap, SGPR_LIMIT);
+  // +1 for each type of lineage (SGPR, VGPR, no reg).
+ const unsigned maxReg = maxSGPR + 1 + maxVGPR + 1 + 1;
+ const unsigned sgprBase = maxVGPR + 1;
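+  // VGPR colors start at 0, SGPR colors are offset by sgprBase, and lineages
+  // without a register class all share maxReg.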
+
+ for (Lineage &L : Lineages) {
+ // Collect HeadSet.
+ Color.HeadSet.insert(L.getHead());
+ Color.TailSet.insert(L.getTail());
+ Color.HeadTailMap[L.getHead()] = L.getTail();
+ // Save color.
+ auto RC = L.RC;
+    // All lineages without a register class go to maxReg.
+    unsigned color = maxReg;
+    if (RC) {
+      if (SIRI->isSGPRClass(RC))
+        color = SAllocMap[&L] + sgprBase;
+      else
+        color = VAllocMap[&L];
+    }
+ unsigned size = L.getSize();
+ for (SUnit *SU : L.Nodes) {
+ Color.colorSU(SU, color);
+ Color.SizeMap[SU] = size;
+ Color.LineageMap[SU] = L.ID;
+ }
+ }
+ Color.maxReg = maxReg;
+ Color.maxSGPR = maxSGPR;
+ Color.maxVGPR = maxVGPR;
+
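+  // Record lineages whose assigned color ranges overlap; hrbSched must order
+  // them carefully to avoid deadlock.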
+ for (unsigned i = 0; i < Lineages.size(); i++) {
+ Lineage &a = Lineages[i];
+ SUnit *headA = a.getHead();
+ unsigned colorA = Color.getColor(headA);
+ unsigned sizeA = Color.getSize(headA);
+ for (unsigned j = i + 1; j < Lineages.size(); j++) {
+ Lineage &b = Lineages[j];
+
+ SUnit *headB = b.getHead();
+ unsigned colorB = Color.getColor(headB);
+ unsigned sizeB = Color.getSize(headB);
+
+ if (colorB >= (colorA + sizeA))
+ continue;
+ if (colorA >= (colorB + sizeB))
+ continue;
+ Color.ShareColorLineages.insert(i);
+ Color.ShareColorLineages.insert(j);
+ }
+ }
+
+ return Color;
+}
+
+void HRB::dump() {
+  for (unsigned i = 0; i < Lineages.size(); i++) {
+ dbgs() << "Lineage" << i << ":";
+ Lineage &L = Lineages[i];
+ for (SUnit *SU : L.Nodes) {
+ dbgs() << " " << SU->NodeNum;
+ }
+ dbgs() << "\n";
+ if (!Color.ColorMap.empty()) {
+ dbgs() << "color:" << Color.getColor(L.getHead())
+ << " size: " << Color.getSize(L.getHead()) << "\n";
+ }
+ if (!ReachMap.empty()) {
+ dbgs() << "conflict:";
+      for (unsigned j = 0; j < Lineages.size(); j++) {
+ if (i == j)
+ continue;
+ if (isConflict(L, Lineages[j])) {
+ dbgs() << " " << j;
+ }
+ }
+ dbgs() << "\n";
+ }
+ }
+}
+
+void HRB::dumpReachMap() {
+ if (!ReachMap.empty()) {
+ dbgs() << "reachMap:";
+ for (auto it : ReachMap) {
+ SUnit *SU = it.first;
+ auto &Reach = it.second;
+ if (SU->isInstr()) {
+ MachineInstr *MI = SU->getInstr();
+ MI->print(dbgs());
+ }
+ dbgs() << SU->NodeNum << "can reach :\n";
+ for (SUnit *R : Reach) {
+ dbgs() << R->NodeNum << " ";
+ }
+ dbgs() << "\n";
+ }
+ dbgs() << "\n";
+ }
+}
+
+// Schedule based on HRB lineages and the coloring result.
+
+std::vector<const SUnit *> hrbSched(std::vector<SUnit> &SUnits,
+ std::vector<SUnit *> &BRoots,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI) {
+ HRB hrb(MRI, SIRI);
+  // Build reach info to avoid infinite loops when building lineages.
+ hrb.buildReachRelation(BRoots);
+ hrb.buildLinear(SUnits);
+
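+  // Keep only bottom roots with no remaining successors; only true sinks are
+  // used to rebuild the reach relation below.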
+ std::sort(BRoots.begin(), BRoots.end(), [](const SUnit *a, const SUnit *b) {
+ return a->NumSuccsLeft < b->NumSuccsLeft;
+ });
+ while (!BRoots.empty() && BRoots.back()->NumSuccsLeft > 0) {
+ BRoots.pop_back();
+ }
+
+ hrb.buildReachRelation(BRoots);
+ hrb.fusionLineages(SUnits);
+ hrb.buildConflict();
+ const HRB::ColorResult &Color = hrb.coloring();
+
+ LLVM_DEBUG(hrb.dump());
+
+  // Every lineage head without a Pred is a top root.
+  // Put the top roots in the ready list, then while the list is not empty:
+  //   a node is a candidate if it is not a lineage head or its color is
+  //   available;
+  //   choose the best candidate and update the ready list.
+ std::vector<SUnit *> ReadyList;
+ for (SUnit &SU : SUnits) {
+ if (SU.NumPredsLeft == 0)
+      ReadyList.emplace_back(&SU);
+ }
+  // When there is more than one sub expression in the DAG, do not mix nodes
+  // from different sub expressions, or scheduling can deadlock because colors
+  // end up spread across different sub expressions.
+
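+  // Track which colors are held by in-flight lineages: a lineage's color
+  // range is reserved when its head is scheduled and released when its tail
+  // is scheduled.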
+ std::bitset<512 * 2> colors;
+ auto isColorAvail = [&colors](unsigned color, unsigned size) -> bool {
+ for (unsigned i = 0; i < size; i++) {
+ unsigned pos = color + i;
+ if (colors.test(pos))
+ return false;
+ }
+ return true;
+ };
+ auto allocColor = [&colors](unsigned color, unsigned size) {
+ for (unsigned i = 0; i < size; i++) {
+ unsigned pos = color + i;
+ assert(!colors.test(pos) && "color already allocated");
+ LLVM_DEBUG(dbgs() << pos << "is allocated\n");
+ colors.set(pos);
+ }
+ };
+
+ auto freeColor = [&colors](unsigned color, unsigned size) {
+ for (unsigned i = 0; i < size; i++) {
+ unsigned pos = color + i;
+ assert(colors.test(pos) && "color has not been allocated");
+ LLVM_DEBUG(dbgs() << pos << "is free\n");
+ colors.reset(pos);
+ }
+ };
+
+  // Save color and size keyed by tail to support the case where two lineages
+  // share a tail. When a tail is finished, free the colors of every working
+  // lineage that ends with that tail.
+ DenseMap<const SUnit *,
+ SmallVector<std::tuple<unsigned, unsigned, unsigned>, 2>>
+ TailMap;
+
+  // For lineages that share the same color, the scheduling order matters.
+  // Suppose l0 has color 0, l1 has color 1, l2 has color 0, l3 has color 1,
+  // l0 conflicts with l3 and l1 conflicts with l2.
+  // Then l0 and l3 must be scheduled together.
+  // Scheduling l0 and l1 first may deadlock: l0 waits on something in l3
+  // while l1 waits on something in l2.
+  // ShareColorLineages marks lineages which share a color with other
+  // lineages. When scheduling, prefer new lineages which have more conflicts
+  // with ShareColorLineages.
+ const DenseSet<unsigned> &ShareColorLineages = Color.ShareColorLineages;
+
+ std::vector<const SUnit *> Schedule;
+ DenseSet<unsigned> UnfinishedLineages;
+ while (!ReadyList.empty()) {
+    // Prefer nodes that conflict with more of the started share-color
+    // lineages.
+ std::sort(ReadyList.begin(), ReadyList.end(),
+ [&UnfinishedLineages, &Color](const SUnit *a, const SUnit *b) {
+ unsigned confA = 0;
+ for (unsigned L : UnfinishedLineages) {
+ if (Color.isConflict(a, L))
+ confA++;
+ }
+ unsigned confB = 0;
+ for (unsigned L : UnfinishedLineages) {
+ if (Color.isConflict(b, L))
+ confB++;
+ }
+ return confA > confB;
+ });
+
+ LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU
+ : ReadyList) {
+ dbgs() << " " << SU->NodeNum;
+ } dbgs() << "\n";);
+ SUnit *Candidate = nullptr;
+ for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) {
+ SUnit *SU = *it;
+ unsigned color = Color.getColor(SU);
+ unsigned size = Color.getSize(SU);
+ // If SU is not head or color is available, SU is the candidate.
+ if (Color.isHead(SU)) {
+ if (!isColorAvail(color, size))
+ continue;
+ // alloc color.
+ allocColor(color, size);
+ // save tail color.
+ const SUnit *Tail = Color.getTail(SU);
+ unsigned ID = Color.getLineage(SU);
+ SmallVector<std::tuple<unsigned, unsigned, unsigned>, 2> &tailColors =
+ TailMap[Tail];
+ tailColors.emplace_back(std::make_tuple(color, size, ID));
+ if (ShareColorLineages.count(ID))
+ UnfinishedLineages.insert(ID);
+ }
+
+      // Free colors for working lineages which end with SU.
+ if (Color.isTail(SU)) {
+ auto &tailColors = TailMap[SU];
+ for (auto &tailTuple : tailColors) {
+ unsigned lineageColor, lineageSize, ID;
+ std::tie(lineageColor, lineageSize, ID) = tailTuple;
+ freeColor(lineageColor, lineageSize);
+ if (ShareColorLineages.count(ID))
+ UnfinishedLineages.insert(ID);
+ }
+ // Clear the tail.
+ TailMap.erase(SU);
+ }
+
+ Candidate = SU;
+ // Remove Candidate from ReadyList.
+ ReadyList.erase(it);
+ break;
+ }
+
+ if (!Candidate) {
+      // If no candidate was found, start a new lineage if one is ready.
+ for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) {
+ SUnit *SU = *it;
+
+ if (!Color.isHead(SU)) {
+ continue;
+ }
+ Candidate = SU;
+ // Remove Candidate from ReadyList.
+ ReadyList.erase(it);
+ break;
+ }
+ }
+ assert(Candidate && "fail to find a Candidate");
+ LLVM_DEBUG(dbgs() << "Sched " << Candidate->NodeNum << "\n");
+
+ // Add all Candidate succ which is Ready.
+ for (SDep &Dep : Candidate->Succs) {
+ if (Dep.isWeak())
+ continue;
+ SUnit *Succ = Dep.getSUnit();
+
+ if (Succ->NumPredsLeft > 0)
+ Succ->NumPredsLeft--;
+ LLVM_DEBUG(dbgs() << "Succ " << Succ->NodeNum << " has "
+ << Succ->NumPredsLeft << " preds\n");
+ if (Succ->NumPredsLeft == 0)
+ ReadyList.emplace_back(Succ);
+ }
+
+ // Sched Candidate.
+ assert(Candidate->isInstr() && "Candidate must be instr Node");
+ Schedule.emplace_back(Candidate);
+ }
+ assert(Schedule.size() == SUnits.size() && "SUnit size should match");
+ return Schedule;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
new file mode 100644
index 000000000000000..c234f3237079353
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
@@ -0,0 +1,197 @@
+#pragma once
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/MC/LaneBitmask.h"
+
+#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+
+namespace llvm {
+class MachineFunction;
+class LiveIntervals;
+class MachineRegisterInfo;
+class SIRegisterInfo;
+class SIInstrInfo;
+class MachineInstr;
+class MachineBasicBlock;
+template<typename GraphType>
+class GraphWriter;
+class SUnit;
+class IntEqClasses;
+class Twine;
+
+using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
+
+// SubExp and BlockExpDag.
+struct SubExp {
+  // Keep original order for SUnits.
+ std::vector<llvm::MachineInstr *> SUnits;
+ llvm::DenseSet<unsigned> TopRegs;
+ llvm::DenseSet<llvm::MachineInstr *> BottomRoots;
+ llvm::DenseSet<unsigned> BottomRegs;
+ bool bMultiDefOutput = false;
+ bool bHasTerminatorInst = false;
+ bool bUseIncomingReg = false;
+ bool bMoveIntoLoop = false;
+ bool bNotSafeToCopy = false;
+ bool bHasMemInst = false;
+ bool bHoist = false;
+  // If a temp/output reg is used by an instruction outside the subExp, it
+  // cannot be moved since not all users would move. But it is OK to clone.
+ bool bCloneOnly = false;
+ bool bTouchSCC = false;
+ llvm::MachineBasicBlock *FromBB;
+ llvm::MachineBasicBlock *ToBB;
+ unsigned sInputSize;
+ unsigned vInputSize;
+ unsigned sOutputSize;
+ unsigned vOutputSize;
+ unsigned sMaxSize;
+ unsigned vMaxSize;
+ LiveSet inputLive;
+ LiveSet outputLive;
+ bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool bMoveUp) const;
+ void calcMaxPressure(const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI);
+ void dump(const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI) const;
+  bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo *SIRI) const;
+};
+
+struct ExpDag {
+ ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI,
+ const llvm::SIInstrInfo *SIII,
+ const bool bJoinInput);
+ const llvm::MachineRegisterInfo &MRI;
+ const llvm::SIRegisterInfo *SIRI;
+ const llvm::SIInstrInfo *SIII;
+ const bool bJoinInputToSubExp;
+
+ std::vector<llvm::SUnit> SUnits; ///< The scheduling units.
+ llvm::DenseMap<llvm::MachineInstr *, llvm::SUnit *> MISUnitMap;
+ llvm::DenseMap<llvm::SUnit *, llvm::MachineInstr *> SUnitMIMap;
+ llvm::DenseMap<unsigned, llvm::SUnit *> InputSUnitMap;
+ llvm::DenseMap<llvm::SUnit *, unsigned> SUnitInputMap;
+ std::vector<SubExp> SubExps;
+ template <typename T>
+ void build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg,
+ T &insts);
+ void dump();
+ void viewGraph(const llvm::Twine &Name, const llvm::Twine &Title) const;
+ /// Returns a label for an SUnit node in a visualization of the ScheduleDAG.
+ std::string getGraphNodeLabel(const llvm::SUnit *SU) const;
+ std::string getDAGName() const;
+ /// Adds custom features for a visualization of the ScheduleDAG.
+ void addCustomGraphFeatures(llvm::GraphWriter<ExpDag *> &) const {}
+private:
+ template<typename T>
+ void initNodes(const LiveSet &InputLiveReg, T &insts);
+ void addDataDep(const llvm::SIRegisterInfo *SIRI);
+ void addCtrlDep();
+ void buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg,
+ const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII);
+};
+
+struct BlockExpDag : public ExpDag {
+ BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII);
+ llvm::LiveIntervals *LIS;
+ llvm::MachineBasicBlock *MBB;
+ llvm::DenseMap<llvm::SUnit *, LiveSet> DagPressureMap;
+ std::vector<std::vector<llvm::SUnit *>> SUnitsInSameDepth;
+ std::vector<SubExp> SubExps;
+ void build();
+ void buildWithPressure();
+private:
+ void buildAvail(const LiveSet &passThruSet,
+ llvm::DenseMap<llvm::SUnit *, LiveSet> &DagAvailRegMap);
+ void buildPressure(const LiveSet &StartLiveReg,
+ const LiveSet &EndLiveReg);
+};
+
+void getRegBound(llvm::MachineBasicBlock *MBB,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI,
+ const llvm::SIInstrInfo *SIII, llvm::LiveIntervals *LIS,
+ unsigned &MaxVGPR, unsigned &MaxSGRP);
+
+// Currently SGPRs and VGPRs are mixed when building lineages to avoid cycles.
+// This may waste registers.
+// Based on "Minimum Register Instruction Sequencing to Reduce Register Spills
+// in Out-of-Order Issue Superscalar Architectures".
+class HRB {
+public:
+ struct Lineage {
+ unsigned ID = 0;
+ const llvm::TargetRegisterClass *RC = nullptr;
+ llvm::SmallVector<llvm::SUnit *, 4> Nodes;
+ llvm::SUnit *getHead() const;
+ llvm::SUnit *getTail() const;
+ void addNode(llvm::SUnit *);
+ unsigned getSize() const;
+ unsigned length() const;
+ };
+ struct ColorResult {
+ llvm::DenseMap<llvm::SUnit *, unsigned> ColorMap;
+ llvm::DenseMap<llvm::SUnit *, unsigned> SizeMap;
+ llvm::DenseMap<llvm::SUnit *, unsigned> LineageMap;
+ llvm::DenseMap<unsigned, llvm::DenseSet<unsigned>> Conflicts;
+ llvm::DenseSet<unsigned> ShareColorLineages;
+ llvm::DenseSet<llvm::SUnit *> HeadSet;
+ llvm::DenseSet<llvm::SUnit *> TailSet;
+ llvm::DenseMap<llvm::SUnit *, llvm::SUnit *> HeadTailMap;
+ unsigned maxReg = 0;
+ unsigned maxVGPR = 0;
+ unsigned maxSGPR = 0;
+ void colorSU(llvm::SUnit *SU, unsigned color);
+ unsigned getLineage(llvm::SUnit *SU) const;
+ bool isConflict(const llvm::SUnit *SU0, unsigned Lineage) const;
+ bool isHead(llvm::SUnit *SU) const;
+ bool isTail(llvm::SUnit *SU) const;
+ const llvm::SUnit *getTail(llvm::SUnit *SU) const;
+ unsigned getColor(const llvm::SUnit *SU) const;
+ unsigned getSize(const llvm::SUnit *SU) const;
+ };
+ HRB(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI)
+ : MRI(MRI), SIRI(SIRI) {}
+
+ void buildLinear(std::vector<llvm::SUnit> &SUnits);
+ void buildConflict();
+ void buildReachRelation(llvm::ArrayRef<llvm::SUnit *> BotRoots);
+ llvm::DenseMap<llvm::SUnit *, llvm::DenseSet<llvm::SUnit *>> &getReachMap() {
+ return ReachMap;
+ }
+ bool canReach(llvm::SUnit *a, llvm::SUnit *b);
+ void updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b,
+ std::vector<llvm::SUnit> &SUnits);
+ void fusionLineages(std::vector<llvm::SUnit> &SUnits);
+ ColorResult &coloring();
+ void dump();
+ void dumpReachMap();
+
+private:
+ Lineage buildChain(llvm::SUnit *Node, std::vector<llvm::SUnit> &SUnits);
+ llvm::SUnit *findHeir(llvm::SUnit *SU, std::vector<llvm::SUnit> &SUnits);
+ bool isConflict(const Lineage &a, const Lineage &b);
+ bool canFuse(const Lineage &a, const Lineage &b);
+ bool tryFuse(Lineage &a, Lineage &b, std::vector<llvm::SUnit> &SUnits);
+ unsigned colorLineages(std::vector<Lineage *> &lineages,
+ llvm::DenseMap<Lineage *, unsigned> &AllocMap,
+ const unsigned Limit);
+
+ llvm::DenseSet<llvm::SUnit *> ChainedNodes;
+ llvm::DenseMap<llvm::SUnit *, llvm::DenseSet<llvm::SUnit *>> ReachMap;
+ bool bRecomputeHeight = false;
+ std::vector<Lineage> Lineages;
+ ColorResult Color;
+ const llvm::MachineRegisterInfo &MRI;
+ const llvm::SIRegisterInfo *SIRI;
+};
+
+std::vector<const llvm::SUnit *> hrbSched(std::vector<llvm::SUnit> &SUnits,
+ std::vector<llvm::SUnit *> &BRoots,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI);
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 96062b30fc0127a..b88673d94a19157 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -507,6 +507,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAtomicOptimizerPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
+ initializeAMDGPUHotBlockRematerializePass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
new file mode 100644
index 000000000000000..c9172bae2cb4ad7
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
@@ -0,0 +1,106 @@
+//===- AMDGPUVMemDegreeDAG.h - Build degree info for VMem on the DAG -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Build degree information for VMem (high latency memory loads) to
+/// help balance latency and register pressure inside a block.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <vector>
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+
+namespace llvm {
+class MachineBasicBlock;
+class SUnit;
+class SIInstrInfo;
+class MachineInstr;
+
+class SimpleDAG {
+public:
+ SimpleDAG(llvm::MachineBasicBlock &MBB, const llvm::SIInstrInfo *TII)
+ : SIII(TII), MBB(MBB) {}
+ std::vector<llvm::SUnit> SUnits;
+ // InstrInfo.
+ const llvm::SIInstrInfo *SIII;
+ llvm::DenseMap<llvm::MachineInstr *, llvm::SUnit *> MISUnitMap;
+ llvm::DenseMap<llvm::SUnit *, llvm::MachineInstr *> SUnitMIMap;
+ llvm::MachineBasicBlock &MBB;
+ void build();
+
+private:
+ void initNodes();
+ void addDependence();
+ void addCtrlDep();
+};
+
+
+// Collect height/depth for high latency memory loads; height/depth is only
+// updated when crossing such a load. Call that height/depth the VMem degree.
+// The rule is that a sample and its user should have different degrees.
+// For example:
+//   a = sample   // a has depth 0, height 3
+//   b = sample a // b has depth 1, height 2
+//   c = sample b // c has depth 2, height 1
+//   user of c    // user of c has depth 2, height 0
+//
+// For the purpose of in-block reorder/remat, nothing is moved or cloned across
+// the block. So should this run after cross-block remat, or in the middle of
+// it, to help reach the target when moving things across blocks alone cannot?
+// Reordering at the beginning has no pressure info; once pressure is known,
+// the max pressure might need to be updated.
+
+class VMemDegreeDAG {
+public:
+ VMemDegreeDAG(std::vector<llvm::SUnit> &Units,
+ const llvm::SIInstrInfo *TII)
+ : SUnits(Units), SIII(TII) {}
+ std::vector<llvm::SUnit> &SUnits;
+ // InstrInfo.
+ const llvm::SIInstrInfo *SIII;
+ void build();
+
+
+ bool isHighLatency(const llvm::SUnit *SU) const;
+ bool isHighLatency(const llvm::MachineInstr *MI) const;
+  // Height/depth based on long latency instructions.
+  std::vector<unsigned> VMemDataHeight;
+  std::vector<unsigned> VMemDataDepth;
+  // Full height/depth also counts non-data dependences.
+ std::vector<unsigned> VMemFullHeight;
+ std::vector<unsigned> VMemFullDepth;
+ llvm::SmallVector<llvm::SUnit *, 16> VMemSUs;
+ llvm::SmallVector<llvm::SmallVector<llvm::SUnit *, 8>, 16> GroupedVMemSUs;
+ llvm::SmallVector<llvm::SmallVector<llvm::SUnit *, 8>, 16> GroupedVMemSUsByDepth;
+
+
+ void dump();
+
+private:
+ static constexpr unsigned kNoReg = -1;
+
+
+ std::pair<unsigned, unsigned> buildVMemDepthHeight(std::vector<unsigned> &VMemHeight,
+ std::vector<unsigned> &VMemDepth, bool bDataOnly);
+ // Compute vmem height/depth.
+ void buildVMemDepthHeight();
+ void buildVMemDataDepthHeight();
+ void groupVmemSUnits();
+
+};
+
+
+
+// Split block based on vmem depth.
+void buildVMemDepth(llvm::MachineBasicBlock &MBB, llvm::VMemDegreeDAG &dag);
+
+} // namespace llvm
+
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 408da0536237edc..92a9b3b3748ca6f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -58,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelDivergenceLowering.cpp
AMDGPUGlobalISelUtils.cpp
+ AMDGPUHotBlockRematerialize.cpp
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
@@ -80,10 +81,14 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUMemoryUtils.cpp
+ AMDGPUMIRUtils.cpp
+ AMDGPUMirDivergenceAnalysis.cpp
+ AMDGPUMirSyncDependenceAnalysis.cpp
AMDGPUIGroupLP.cpp
AMDGPUMCResourceInfo.cpp
AMDGPUMarkLastScratchLoad.cpp
AMDGPUMIRFormatter.cpp
+ AMDGPUOccupancyAndLatencyHelper.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUPerfHintAnalysis.cpp
AMDGPUPostLegalizerCombiner.cpp
@@ -106,6 +111,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUSelectionDAGInfo.cpp
AMDGPUSetWavePriority.cpp
AMDGPUSplitModule.cpp
+ AMDGPUSubExpDag.cpp
AMDGPUSubtarget.cpp
AMDGPUTargetMachine.cpp
AMDGPUTargetObjectFile.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 7554b9f578fcbb1..aa4b3f948b726f3 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -47,6 +47,10 @@ struct GCNRegPressure {
void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
+ unsigned getMaxSGPR() const {
+ return std::max(getSGPRNum(), getSGPRTuplesWeight());
+ }
+
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR32]; }
/// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 933935a86f9f98f..cb10df2c3412906 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1313,6 +1313,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool isLowLatencyInstruction(const MachineInstr &MI) const;
bool isHighLatencyDef(int Opc) const override;
+  bool isHighLatencyInstruction(const MachineInstr &MI) const {
+ return isHighLatencyDef(MI.getOpcode());
+ }
/// Return the descriptor of the target-specific machine instruction
/// that corresponds to the specified pseudo or native opcode.
diff --git a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir
new file mode 100644
index 000000000000000..e8a66b47ac732b5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir
@@ -0,0 +1,405 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-hot-block-remat-aggressive -amdgpu-remat-enable-sub-exp-remat | FileCheck %s
+
+# DEFS
+# CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni00:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div00]], implicit $exec
+# CHECK: %[[#div01:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni01:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div01]], implicit $exec
+# CHECK: %[[#div02:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni02:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div02]], implicit $exec
+# CHECK: %[[#div03:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni03:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div03]], implicit $exec
+# CHECK: %[[#div04:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni04:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div04]], implicit $exec
+# CHECK: %[[#div05:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni05:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div05]], implicit $exec
+# CHECK: %[[#div06:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni06:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div06]], implicit $exec
+# CHECK: %[[#div07:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni07:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div07]], implicit $exec
+# CHECK: %[[#div08:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni08:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div08]], implicit $exec
+# CHECK: %[[#div09:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni09:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div09]], implicit $exec
+# CHECK: %[[#div10:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni10:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div10]], implicit $exec
+# CHECK: %[[#div11:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni11:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div11]], implicit $exec
+# CHECK: %[[#div12:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni12:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div12]], implicit $exec
+# CHECK: %[[#div13:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni13:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div13]], implicit $exec
+# CHECK: %[[#div14:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni14:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div14]], implicit $exec
+# CHECK: %[[#div15:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni15:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div15]], implicit $exec
+# CHECK: %[[#div16:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni16:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div16]], implicit $exec
+# CHECK: %[[#div17:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni17:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div17]], implicit $exec
+# CHECK: %[[#div18:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni18:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div18]], implicit $exec
+# CHECK: %[[#div19:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni19:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div19]], implicit $exec
+# CHECK: %[[#div20:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni20:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div20]], implicit $exec
+# CHECK: %[[#div21:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni21:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div21]], implicit $exec
+# CHECK: %[[#div22:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni22:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div22]], implicit $exec
+# CHECK: %[[#div23:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni23:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div23]], implicit $exec
+# CHECK: %[[#div24:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni24:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div24]], implicit $exec
+# CHECK: %[[#div25:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni25:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div25]], implicit $exec
+# CHECK: %[[#div26:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni26:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div26]], implicit $exec
+# CHECK: %[[#div27:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni27:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div27]], implicit $exec
+# CHECK: %[[#div28:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni28:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div28]], implicit $exec
+# CHECK: %[[#div29:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni29:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div29]], implicit $exec
+# CHECK: %[[#div30:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni30:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div30]], implicit $exec
+# CHECK: %[[#div31:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni31:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div31]], implicit $exec
+# CHECK: %[[#div32:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni32:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div32]], implicit $exec
+# CHECK: %[[#div33:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni33:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div33]], implicit $exec
+# CHECK: %[[#div34:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni34:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div34]], implicit $exec
+# CHECK: %[[#div35:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni35:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div35]], implicit $exec
+# CHECK: %[[#div36:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni36:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div36]], implicit $exec
+# CHECK: %[[#div37:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni37:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div37]], implicit $exec
+# CHECK: %[[#div38:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni38:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div38]], implicit $exec
+# CHECK: %[[#div39:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni39:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div39]], implicit $exec
+# CHECK: %[[#div40:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni40:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div40]], implicit $exec
+# CHECK: %[[#div41:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni41:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div41]], implicit $exec
+# CHECK: %[[#div42:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni42:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div42]], implicit $exec
+# CHECK: %[[#div43:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni43:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div43]], implicit $exec
+# CHECK: %[[#div44:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni44:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div44]], implicit $exec
+# CHECK: %[[#div45:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni45:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div45]], implicit $exec
+# CHECK: %[[#div46:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni46:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div46]], implicit $exec
+# CHECK: %[[#div47:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni47:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div47]], implicit $exec
+# CHECK: %[[#div48:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni48:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div48]], implicit $exec
+# CHECK: %[[#div49:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni49:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div49]], implicit $exec
+# CHECK: %[[#div50:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni50:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div50]], implicit $exec
+# CHECK: %[[#div51:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni51:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div51]], implicit $exec
+# CHECK: %[[#div52:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni52:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div52]], implicit $exec
+# CHECK: %[[#div53:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni53:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div53]], implicit $exec
+# CHECK: %[[#div54:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni54:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div54]], implicit $exec
+# CHECK: %[[#div55:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni55:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div55]], implicit $exec
+# CHECK: %[[#div56:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni56:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div56]], implicit $exec
+# CHECK: %[[#div57:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni57:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div57]], implicit $exec
+# CHECK: %[[#div58:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni58:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div58]], implicit $exec
+# CHECK: %[[#div59:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni59:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div59]], implicit $exec
+
+
+# USERS:
+# CHECK: %[[#div_00:]]:vgpr_32 = COPY %[[#uni00]]
+#CHECK: EXP 0, %[[#div_00]],
+# CHECK: %[[#div_01:]]:vgpr_32 = COPY %[[#uni01]]
+#CHECK: EXP 0, %[[#div_01]],
+# CHECK: %[[#div_02:]]:vgpr_32 = COPY %[[#uni02]]
+#CHECK: EXP 0, %[[#div_02]],
+# CHECK: %[[#div_03:]]:vgpr_32 = COPY %[[#uni03]]
+#CHECK: EXP 0, %[[#div_03]],
+# CHECK: %[[#div_04:]]:vgpr_32 = COPY %[[#uni04]]
+#CHECK: EXP 0, %[[#div_04]],
+# CHECK: %[[#div_05:]]:vgpr_32 = COPY %[[#uni05]]
+#CHECK: EXP 0, %[[#div_05]],
+# CHECK: %[[#div_06:]]:vgpr_32 = COPY %[[#uni06]]
+#CHECK: EXP 0, %[[#div_06]],
+# CHECK: %[[#div_07:]]:vgpr_32 = COPY %[[#uni07]]
+#CHECK: EXP 0, %[[#div_07]],
+# CHECK: %[[#div_08:]]:vgpr_32 = COPY %[[#uni08]]
+#CHECK: EXP 0, %[[#div_08]],
+# CHECK: %[[#div_09:]]:vgpr_32 = COPY %[[#uni09]]
+#CHECK: EXP 0, %[[#div_09]],
+# CHECK: %[[#div_10:]]:vgpr_32 = COPY %[[#uni10]]
+#CHECK: EXP 0, %[[#div_10]],
+# CHECK: %[[#div_11:]]:vgpr_32 = COPY %[[#uni11]]
+#CHECK: EXP 0, %[[#div_11]],
+# CHECK: %[[#div_12:]]:vgpr_32 = COPY %[[#uni12]]
+#CHECK: EXP 0, %[[#div_12]],
+# CHECK: %[[#div_13:]]:vgpr_32 = COPY %[[#uni13]]
+#CHECK: EXP 0, %[[#div_13]],
+# CHECK: %[[#div_14:]]:vgpr_32 = COPY %[[#uni14]]
+#CHECK: EXP 0, %[[#div_14]],
+# CHECK: %[[#div_15:]]:vgpr_32 = COPY %[[#uni15]]
+#CHECK: EXP 0, %[[#div_15]],
+# CHECK: %[[#div_16:]]:vgpr_32 = COPY %[[#uni16]]
+#CHECK: EXP 0, %[[#div_16]],
+# CHECK: %[[#div_17:]]:vgpr_32 = COPY %[[#uni17]]
+#CHECK: EXP 0, %[[#div_17]],
+# CHECK: %[[#div_18:]]:vgpr_32 = COPY %[[#uni18]]
+#CHECK: EXP 0, %[[#div_18]],
+# CHECK: %[[#div_19:]]:vgpr_32 = COPY %[[#uni19]]
+#CHECK: EXP 0, %[[#div_19]],
+# CHECK: %[[#div_20:]]:vgpr_32 = COPY %[[#uni20]]
+#CHECK: EXP 0, %[[#div_20]],
+# CHECK: %[[#div_21:]]:vgpr_32 = COPY %[[#uni21]]
+#CHECK: EXP 0, %[[#div_21]],
+# CHECK: %[[#div_22:]]:vgpr_32 = COPY %[[#uni22]]
+#CHECK: EXP 0, %[[#div_22]],
+# CHECK: %[[#div_23:]]:vgpr_32 = COPY %[[#uni23]]
+#CHECK: EXP 0, %[[#div_23]],
+# CHECK: %[[#div_24:]]:vgpr_32 = COPY %[[#uni24]]
+#CHECK: EXP 0, %[[#div_24]],
+# CHECK: %[[#div_25:]]:vgpr_32 = COPY %[[#uni25]]
+#CHECK: EXP 0, %[[#div_25]],
+# CHECK: %[[#div_26:]]:vgpr_32 = COPY %[[#uni26]]
+#CHECK: EXP 0, %[[#div_26]],
+# CHECK: %[[#div_27:]]:vgpr_32 = COPY %[[#uni27]]
+#CHECK: EXP 0, %[[#div_27]],
+# CHECK: %[[#div_28:]]:vgpr_32 = COPY %[[#uni28]]
+#CHECK: EXP 0, %[[#div_28]],
+# CHECK: %[[#div_29:]]:vgpr_32 = COPY %[[#uni29]]
+#CHECK: EXP 0, %[[#div_29]],
+# CHECK: %[[#div_30:]]:vgpr_32 = COPY %[[#uni30]]
+#CHECK: EXP 0, %[[#div_30]],
+# CHECK: %[[#div_31:]]:vgpr_32 = COPY %[[#uni31]]
+#CHECK: EXP 0, %[[#div_31]],
+# CHECK: %[[#div_32:]]:vgpr_32 = COPY %[[#uni32]]
+#CHECK: EXP 0, %[[#div_32]],
+# CHECK: %[[#div_33:]]:vgpr_32 = COPY %[[#uni33]]
+#CHECK: EXP 0, %[[#div_33]],
+# CHECK: %[[#div_34:]]:vgpr_32 = COPY %[[#uni34]]
+#CHECK: EXP 0, %[[#div_34]],
+# CHECK: %[[#div_35:]]:vgpr_32 = COPY %[[#uni35]]
+#CHECK: EXP 0, %[[#div_35]],
+# CHECK: %[[#div_36:]]:vgpr_32 = COPY %[[#uni36]]
+#CHECK: EXP 0, %[[#div_36]],
+# CHECK: %[[#div_37:]]:vgpr_32 = COPY %[[#uni37]]
+#CHECK: EXP 0, %[[#div_37]],
+# CHECK: %[[#div_38:]]:vgpr_32 = COPY %[[#uni38]]
+#CHECK: EXP 0, %[[#div_38]],
+# CHECK: %[[#div_39:]]:vgpr_32 = COPY %[[#uni39]]
+#CHECK: EXP 0, %[[#div_39]],
+# CHECK: %[[#div_40:]]:vgpr_32 = COPY %[[#uni40]]
+#CHECK: EXP 0, %[[#div_40]],
+# CHECK: %[[#div_41:]]:vgpr_32 = COPY %[[#uni41]]
+#CHECK: EXP 0, %[[#div_41]],
+# CHECK: %[[#div_42:]]:vgpr_32 = COPY %[[#uni42]]
+#CHECK: EXP 0, %[[#div_42]],
+# CHECK: %[[#div_43:]]:vgpr_32 = COPY %[[#uni43]]
+#CHECK: EXP 0, %[[#div_43]],
+# CHECK: %[[#div_44:]]:vgpr_32 = COPY %[[#uni44]]
+#CHECK: EXP 0, %[[#div_44]],
+# CHECK: %[[#div_45:]]:vgpr_32 = COPY %[[#uni45]]
+#CHECK: EXP 0, %[[#div_45]],
+# CHECK: %[[#div_46:]]:vgpr_32 = COPY %[[#uni46]]
+#CHECK: EXP 0, %[[#div_46]],
+# CHECK: %[[#div_47:]]:vgpr_32 = COPY %[[#uni47]]
+#CHECK: EXP 0, %[[#div_47]],
+# CHECK: %[[#div_48:]]:vgpr_32 = COPY %[[#uni48]]
+#CHECK: EXP 0, %[[#div_48]],
+# CHECK: %[[#div_49:]]:vgpr_32 = COPY %[[#uni49]]
+#CHECK: EXP 0, %[[#div_49]],
+# CHECK: %[[#div_50:]]:vgpr_32 = COPY %[[#uni50]]
+#CHECK: EXP 0, %[[#div_50]],
+# CHECK: %[[#div_51:]]:vgpr_32 = COPY %[[#uni51]]
+#CHECK: EXP 0, %[[#div_51]],
+# CHECK: %[[#div_52:]]:vgpr_32 = COPY %[[#uni52]]
+#CHECK: EXP 0, %[[#div_52]],
+# CHECK: %[[#div_53:]]:vgpr_32 = COPY %[[#uni53]]
+#CHECK: EXP 0, %[[#div_53]],
+# CHECK: %[[#div_54:]]:vgpr_32 = COPY %[[#uni54]]
+#CHECK: EXP 0, %[[#div_54]],
+# CHECK: %[[#div_55:]]:vgpr_32 = COPY %[[#uni55]]
+#CHECK: EXP 0, %[[#div_55]],
+# CHECK: %[[#div_56:]]:vgpr_32 = COPY %[[#uni56]]
+#CHECK: EXP 0, %[[#div_56]],
+# CHECK: %[[#div_57:]]:vgpr_32 = COPY %[[#uni57]]
+#CHECK: EXP 0, %[[#div_57]],
+# CHECK: %[[#div_58:]]:vgpr_32 = COPY %[[#uni58]]
+#CHECK: EXP 0, %[[#div_58]],
+# CHECK: %[[#div_59:]]:vgpr_32 = COPY %[[#uni59]]
+#CHECK: EXP 0, %[[#div_59]],
+
+
+--- |
+ source_filename = ".\main.ll"
+ define amdgpu_ps void @main() #1 {
+ ret void
+ }
+ attributes #1 = { "target-cpu"="gfx1010" }
+ !llvm.ident = !{!0}
+ !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name: main
+tracksRegLiveness: true
+liveins:
+ - { reg: '$sgpr0' }
+ - { reg: '$sgpr1' }
+ - { reg: '$sgpr8' }
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $sgpr1, $sgpr8, $vgpr0, $vgpr1
+
+ %1000:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1001:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1002:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1003:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1004:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1005:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1006:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1007:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1008:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1009:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %1059, 0, implicit $exec, implicit $mode
+ $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+ %99:vgpr_32 = COPY %1058
+ S_BRANCH %bb.2
+
+ bb.2:
+ %1:vgpr_32 = IMPLICIT_DEF
+ EXP 0, killed %1000, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1001, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1002, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1003, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1004, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1005, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1006, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1007, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1008, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1009, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1010, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1011, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1012, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1013, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1014, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1015, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1016, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1017, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1018, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1019, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1020, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1021, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1022, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1023, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1024, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1025, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1026, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1027, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1028, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1029, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1030, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1031, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1032, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1033, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1034, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1035, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1036, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1037, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1038, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1039, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1040, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1041, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1042, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1043, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1044, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1045, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1046, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1047, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1048, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1049, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1050, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1051, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1052, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1053, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1054, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1055, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1056, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1057, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1058, %1, %1, %1, -1, -1, 15, implicit $exec
+ EXP 0, killed %1059, %1, %1, %1, -1, -1, 15, implicit $exec
+ S_ENDPGM 0
+...
>From 280571f2da195c1bd53e47c6f676999214233a80 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Thu, 6 Feb 2025 13:52:02 -0800
Subject: [PATCH 2/3] Fixed build, and added simple tests that exercise major
code paths
---
.../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 8 +-
.../test/CodeGen/AMDGPU/remat/group_remat.mir | 507 ++++++++++++++
.../AMDGPU/remat/group_remat_with_uses.mir | 641 ++++++++++++++++++
.../test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 450 ++++++++++++
4 files changed, 1603 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/remat/group_remat.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 44ebaa2d51bec19..8647185bf5d51b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -291,7 +291,7 @@ unsigned CollectFnPressure(
MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI,
const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure,
RematStatus &status) {
- unsigned TgtOcc = ST->getOccupancyWithLocalMemSize(MF);
+ unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second;
// If only have one block, input/ouput virtual live set are empty.
if (MF.size() > 1) {
// Build input output live reg first.
@@ -1351,7 +1351,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
bool bForceRematSgpr = bSGPRSpill | status.bNotBalance;
// If bound by lds, skip.
- if (status.TargetOcc > ST->getOccupancyWithLocalMemSize(MF) &&
+ if (status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second &&
!bForceRematSgpr)
return false;
@@ -1663,6 +1663,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
Register OpReg = Op.getReg();
if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO))
continue;
+ if (IsImplicitUseOfReg(Op, AMDGPU::MODE))
+ continue;
if (IsImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI))
continue;
// Alow unused scc define.
@@ -4454,7 +4456,7 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt
}
// If bound by lds, skip.
- if ((status.TargetOcc + 1) > ST->getOccupancyWithLocalMemSize(MF) &&
+ if ((status.TargetOcc + 1) > ST->getOccupancyWithWorkGroupSizes(MF).second &&
!bSGPRSpill)
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir
new file mode 100644
index 000000000000000..7f3483c66a5d9b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir
@@ -0,0 +1,507 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat | FileCheck %s
+
+# Check that the whole expression gets moved to its uses in bb.2.
+# CHECK: bb.0:
+# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0
+# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1
+# CHECK: bb.1:
+# CHECK: bb.2:
+# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]]
+# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]]
+# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]]
+# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]]
+# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]]
+# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]]
+# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]]
+# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]]
+# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]]
+# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]]
+# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]]
+# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]]
+# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]]
+# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]]
+# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]]
+# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]]
+# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]]
+# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]]
+# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]]
+# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]]
+# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]]
+# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]]
+# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]]
+# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]]
+# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]]
+# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]]
+# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]]
+# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]]
+# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]]
+# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]]
+# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]]
+# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]]
+# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]]
+# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]]
+# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]]
+# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]]
+# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]]
+# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]]
+# CHECK: %[[#r5040:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5021]]
+# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]]
+# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]]
+# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]]
+# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]]
+# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]]
+# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]]
+# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]]
+# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]]
+# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5026]]
+# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]]
+# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]]
+# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]]
+# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]]
+# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]]
+# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]]
+# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]]
+# CHECK: %[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]]
+# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]]
+# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]]
+# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]]
+# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]]
+# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]]
+# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]]
+# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]]
+# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]]
+# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]]
+# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]]
+# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]]
+# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]]
+# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]]
+# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]]
+# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]]
+# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]]
+# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]]
+# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]]
+# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]]
+# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]]
+# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]]
+# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]]
+# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]]
+# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]]
+# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]]
+# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]]
+# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]]
+# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]]
+# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]]
+# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]]
+# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]]
+# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5046]]
+# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]]
+# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]]
+# CHECK: %[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]]
+# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]]
+# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]]
+# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]]
+# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]]
+# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]]
+# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]]
+# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]]
+# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]]
+# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]]
+# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]]
+# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]]
+# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]]
+# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]]
+# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]]
+# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]]
+# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]]
+# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]]
+# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]]
+# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]]
+# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]]
+# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]]
+# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]]
+# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]]
+# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]]
+# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]]
+# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]]
+# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]]
+# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]]
+# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]]
+# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]]
+# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]]
+# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]]
+# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]]
+# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]]
+# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]]
+# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]]
+# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]]
+# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]]
+# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]]
+# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]]
+# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]]
+
+
+--- |
+ source_filename = ".\main.ll"
+ define amdgpu_ps void @main() #1 {
+ ret void
+ }
+ attributes #1 = { "target-cpu"="gfx1010" }
+ !llvm.ident = !{!0}
+ !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name: main
+tracksRegLiveness: true
+liveins:
+ - { reg: '$sgpr0' }
+ - { reg: '$sgpr1' }
+ - { reg: '$sgpr2' }
+ - { reg: '$sgpr3' }
+ - { reg: '$sgpr4' }
+ - { reg: '$sgpr5' }
+ - { reg: '$sgpr6' }
+ - { reg: '$sgpr7' }
+ - { reg: '$sgpr8' }
+ - { reg: '$sgpr8' }
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
+
+ undef %0.sub0:sgpr_64 = COPY $sgpr0
+ undef %0.sub1:sgpr_64 = COPY $sgpr1
+
+ undef %1.sub0:sgpr_128 = COPY $sgpr4
+ undef %1.sub1:sgpr_128 = COPY $sgpr5
+ undef %1.sub2:sgpr_128 = COPY $sgpr6
+ undef %1.sub3:sgpr_128 = COPY $sgpr7
+
+
+ %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec
+ %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode
+ %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode
+ %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode
+ %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode
+ %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode
+ %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode
+ %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode
+ %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode
+ %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode
+ %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode
+ %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode
+ %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode
+ %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode
+ %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode
+ %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode
+ %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode
+ %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode
+ %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode
+ %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode
+ %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode
+ %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode
+ %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode
+ %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode
+ %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit $mode
+ %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode
+ %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode
+ %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode
+ %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode
+ %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode
+ %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode
+ %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode
+ %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode
+ %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode
+ %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode
+ %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode
+ %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode
+ %5038:vgpr_32 = V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode
+ %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode
+ %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode
+ %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode
+ %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode
+ %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode
+ %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode
+ %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode
+ %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode
+ %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode
+ %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode
+ %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode
+ %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode
+ %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode
+ %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode
+ %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode
+ %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode
+ %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode
+ %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode
+ %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode
+ %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode
+ %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode
+ %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode
+ %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode
+ %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode
+ %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode
+ %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode
+ %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode
+ %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode
+ %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode
+ %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode
+ %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode
+ %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode
+ %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode
+ %5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode
+ %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode
+ %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode
+ %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode
+ %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode
+ %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode
+ %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode
+ %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode
+ %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode
+ %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode
+ %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode
+ %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode
+ %5084:vgpr_32 = V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode
+ %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode
+ %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode
+ %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode
+ %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode
+ %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode
+ %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode
+ %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode
+ %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode
+ %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode
+ %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode
+ %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode
+ %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode
+ %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode
+ %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode
+ %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode
+ %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode
+ %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode
+ %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode
+ %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode
+ %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode
+ %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode
+ %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode
+ %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode
+ %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode
+ %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode
+ %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode
+ %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode
+ %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode
+ %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode
+ %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode
+ %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode
+ %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode
+ %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit $mode
+ %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode
+ %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode
+ %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode
+ %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode
+ %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode
+ %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode
+ %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode
+ %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode
+ %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode
+ %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode
+ %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode
+ %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode
+ %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode
+ %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode
+ %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode
+ %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode
+
+
+ %8000:vgpr_32 = IMPLICIT_DEF
+ %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+ $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+
+ %8001:vgpr_32 = COPY %8000
+ %8002:vgpr_32 = COPY %8000
+ %8003:vgpr_32 = COPY %8000
+ %8004:vgpr_32 = COPY %8000
+ %8005:vgpr_32 = COPY %8000
+ %8006:vgpr_32 = COPY %8000
+ %8007:vgpr_32 = COPY %8000
+ %8008:vgpr_32 = COPY %8000
+ %8009:vgpr_32 = COPY %8000
+ %8010:vgpr_32 = COPY %8000
+ %8011:vgpr_32 = COPY %8000
+ %8012:vgpr_32 = COPY %8000
+ %8013:vgpr_32 = COPY %8000
+ %8014:vgpr_32 = COPY %8000
+ %8015:vgpr_32 = COPY %8000
+ %8016:vgpr_32 = COPY %8000
+ %8017:vgpr_32 = COPY %8000
+
+ %9001:vgpr_32 = COPY %8001
+ %9002:vgpr_32 = COPY %8002
+ %9003:vgpr_32 = COPY %8003
+ %9004:vgpr_32 = COPY %8004
+ %9005:vgpr_32 = COPY %8005
+ %9006:vgpr_32 = COPY %8006
+ %9007:vgpr_32 = COPY %8007
+ %9008:vgpr_32 = COPY %8008
+ %9009:vgpr_32 = COPY %8009
+ %9010:vgpr_32 = COPY %8010
+ %9011:vgpr_32 = COPY %8011
+ %9012:vgpr_32 = COPY %8012
+ %9013:vgpr_32 = COPY %8013
+ %9014:vgpr_32 = COPY %8014
+ %9015:vgpr_32 = COPY %8015
+ %9016:vgpr_32 = COPY %8016
+ %9017:vgpr_32 = COPY %8017
+
+ S_BRANCH %bb.2
+
+ bb.2:
+
+ %3:vgpr_32 = IMPLICIT_DEF
+
+ EXP 0, killed %500, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %501, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %502, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %503, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %504, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %505, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %506, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %507, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %508, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %509, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5010, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5011, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5012, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5013, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5014, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5015, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5016, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5017, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5018, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5019, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5020, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5021, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5022, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5023, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5024, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5025, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5026, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5027, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5028, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5029, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5030, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5031, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5032, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5033, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5034, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5035, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5036, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5037, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5038, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5039, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5040, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5041, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5042, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5043, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5044, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5045, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5046, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5047, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5048, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5049, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5050, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5051, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5052, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5053, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5054, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5055, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5056, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5057, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5058, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5059, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5060, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5061, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5062, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5063, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5064, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5065, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5066, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5067, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5068, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5069, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5070, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5071, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5072, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5073, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5074, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5075, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5076, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5077, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5078, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5079, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5080, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5081, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5082, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5083, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5084, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5085, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5086, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5087, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5088, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5089, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5090, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5091, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5092, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5093, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5094, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5095, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5096, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5097, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5098, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5099, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50100, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50101, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50102, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50103, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50104, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50105, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50106, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50107, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50108, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50109, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50110, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50111, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50112, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50113, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50114, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50115, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50116, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50117, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50118, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50119, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50120, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50121, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50122, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50123, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50124, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50125, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50126, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50127, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50128, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50129, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50130, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50131, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50132, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50133, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+ S_ENDPGM 0
+...
+
\ No newline at end of file
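For readers skimming the two tests: group_remat.mir above expects the multiply chain to be moved into bb.2, while group_remat_with_uses.mir below adds uses of the chain in bb.0 and expects it to be cloned there instead. The following is only an illustrative sketch of those two behaviours in terms of generic MachineFunction/MachineBasicBlock APIs, not the pass's own helper code; the function names are made up, and the clone flavour omits the virtual-register rewriting a real rematerialization would have to do.

// Hedged sketch of "move" vs. "clone" of a def chain; not taken from the pass.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// "Move" flavour (group_remat.mir): the defs have no remaining uses in the
// defining block, so the original instructions can simply be spliced down to
// the block that uses them, in def order.
static void moveChain(ArrayRef<MachineInstr *> DefChain,
                      MachineBasicBlock &UseMBB,
                      MachineBasicBlock::iterator InsertPt) {
  for (MachineInstr *MI : DefChain)
    UseMBB.splice(InsertPt, MI->getParent(), MI->getIterator());
}

// "Clone" flavour (group_remat_with_uses.mir): the defs are still used in the
// defining block, so copies are inserted at the use block and the originals
// stay put. Rewriting the cloned defs and their uses to fresh vregs is
// omitted here for brevity.
static void cloneChain(ArrayRef<MachineInstr *> DefChain,
                       MachineBasicBlock &UseMBB,
                       MachineBasicBlock::iterator InsertPt) {
  MachineFunction &MF = *UseMBB.getParent();
  for (MachineInstr *MI : DefChain)
    UseMBB.insert(InsertPt, MF.CloneMachineInstr(MI));
}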
diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir
new file mode 100644
index 000000000000000..637a683bdd041d4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir
@@ -0,0 +1,641 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat -amdgpu-remat-enable-sub-exp-remat-aggressive | FileCheck %s
+
+# Check that the whole expression gets CLONED to its uses in bb.2, because its results are still used in bb.0.
+# CHECK: bb.0:
+# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0
+# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1
+# CHECK: bb.1:
+# CHECK: bb.2:
+# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]]
+# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]]
+# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]]
+# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]]
+# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]]
+# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]]
+# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]]
+# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]]
+# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]]
+# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]]
+# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]]
+# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]]
+# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]]
+# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]]
+# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]]
+# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]]
+# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]]
+# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]]
+# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]]
+# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]]
+# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]]
+# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]]
+# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]]
+# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]]
+# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]]
+# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]]
+# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]]
+# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]]
+# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]]
+# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]]
+# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]]
+# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]]
+# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]]
+# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]]
+# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]]
+# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]]
+# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]]
+# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]]
+# CHECK: %[[#r5040:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5021]]
+# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]]
+# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]]
+# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]]
+# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]]
+# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]]
+# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]]
+# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]]
+# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]]
+# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5026]]
+# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]]
+# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]]
+# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]]
+# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]]
+# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]]
+# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]]
+# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]]
+# CHECK: %[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]]
+# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]]
+# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]]
+# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]]
+# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]]
+# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]]
+# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]]
+# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]]
+# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]]
+# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]]
+# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]]
+# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]]
+# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]]
+# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]]
+# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]]
+# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]]
+# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]]
+# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]]
+# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]]
+# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]]
+# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]]
+# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]]
+# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]]
+# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]]
+# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]]
+# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]]
+# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]]
+# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]]
+# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]]
+# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]]
+# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]]
+# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]]
+# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5046]]
+# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]]
+# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]]
+# CHECK: %[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]]
+# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]]
+# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]]
+# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]]
+# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]]
+# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]]
+# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]]
+# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]]
+# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]]
+# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]]
+# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]]
+# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]]
+# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]]
+# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]]
+# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]]
+# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]]
+# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]]
+# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]]
+# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]]
+# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]]
+# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]]
+# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]]
+# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]]
+# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]]
+# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]]
+# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]]
+# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]]
+# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]]
+# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]]
+# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]]
+# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]]
+# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]]
+# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]]
+# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]]
+# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]]
+# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]]
+# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]]
+# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]]
+# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]]
+# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]]
+# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]]
+# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]]
+
+
+--- |
+ source_filename = ".\main.ll"
+ define amdgpu_ps void @main() #1 {
+ ret void
+ }
+ attributes #1 = { "target-cpu"="gfx1010" }
+ !llvm.ident = !{!0}
+ !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name: main
+tracksRegLiveness: true
+liveins:
+ - { reg: '$sgpr0' }
+ - { reg: '$sgpr1' }
+ - { reg: '$sgpr2' }
+ - { reg: '$sgpr3' }
+ - { reg: '$sgpr4' }
+ - { reg: '$sgpr5' }
+ - { reg: '$sgpr6' }
+ - { reg: '$sgpr7' }
+ - { reg: '$sgpr8' }
+ - { reg: '$sgpr8' }
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
+
+ undef %0.sub0:sgpr_64 = COPY $sgpr0
+ undef %0.sub1:sgpr_64 = COPY $sgpr1
+
+ undef %1.sub0:sgpr_128 = COPY $sgpr4
+ undef %1.sub1:sgpr_128 = COPY $sgpr5
+ undef %1.sub2:sgpr_128 = COPY $sgpr6
+ undef %1.sub3:sgpr_128 = COPY $sgpr7
+
+
+ %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec
+ %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode
+ %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode
+ %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode
+ %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode
+ %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode
+ %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode
+ %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode
+ %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode
+ %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode
+ %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode
+ %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode
+ %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode
+ %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode
+ %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode
+ %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode
+ %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode
+ %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode
+ %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode
+ %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode
+ %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode
+ %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode
+ %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode
+ %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode
+ %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit $mode
+ %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode
+ %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode
+ %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode
+ %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode
+ %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode
+ %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode
+ %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode
+ %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode
+ %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode
+ %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode
+ %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode
+ %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode
+ %5038:vgpr_32 = V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode
+ %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode
+ %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode
+ %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode
+ %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode
+ %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode
+ %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode
+ %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode
+ %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode
+ %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode
+ %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode
+ %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode
+ %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode
+ %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode
+ %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode
+ %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode
+ %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode
+ %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode
+ %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode
+ %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode
+ %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode
+ %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode
+ %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode
+ %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode
+ %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode
+ %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode
+ %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode
+ %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode
+ %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode
+ %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode
+ %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode
+ %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode
+ %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode
+ %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode
+ %5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode
+ %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode
+ %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode
+ %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode
+ %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode
+ %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode
+ %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode
+ %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode
+ %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode
+ %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode
+ %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode
+ %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode
+ %5084:vgpr_32 = V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode
+ %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode
+ %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode
+ %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode
+ %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode
+ %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode
+ %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode
+ %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode
+ %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode
+ %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode
+ %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode
+ %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode
+ %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode
+ %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode
+ %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode
+ %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode
+ %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode
+ %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode
+ %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode
+ %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode
+ %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode
+ %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode
+ %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode
+ %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode
+ %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode
+ %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode
+ %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode
+ %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode
+ %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode
+ %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode
+ %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode
+ %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode
+ %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode
+ %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit $mode
+ %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode
+ %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode
+ %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode
+ %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode
+ %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode
+ %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode
+ %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode
+ %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode
+ %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode
+ %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode
+ %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode
+ %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode
+ %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode
+ %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode
+ %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode
+ %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode
+ EXP 0, %500, %500, %500, %500, -1, -1, 15, implicit $exec
+ EXP 0, %501, %501, %501, %501, -1, -1, 15, implicit $exec
+ EXP 0, %502, %502, %502, %502, -1, -1, 15, implicit $exec
+ EXP 0, %503, %503, %503, %503, -1, -1, 15, implicit $exec
+ EXP 0, %504, %504, %504, %504, -1, -1, 15, implicit $exec
+ EXP 0, %505, %505, %505, %505, -1, -1, 15, implicit $exec
+ EXP 0, %506, %506, %506, %506, -1, -1, 15, implicit $exec
+ EXP 0, %507, %507, %507, %507, -1, -1, 15, implicit $exec
+ EXP 0, %508, %508, %508, %508, -1, -1, 15, implicit $exec
+ EXP 0, %509, %509, %509, %509, -1, -1, 15, implicit $exec
+ EXP 0, %5010, %5010, %5010, %5010, -1, -1, 15, implicit $exec
+ EXP 0, %5011, %5011, %5011, %5011, -1, -1, 15, implicit $exec
+ EXP 0, %5012, %5012, %5012, %5012, -1, -1, 15, implicit $exec
+ EXP 0, %5013, %5013, %5013, %5013, -1, -1, 15, implicit $exec
+ EXP 0, %5014, %5014, %5014, %5014, -1, -1, 15, implicit $exec
+ EXP 0, %5015, %5015, %5015, %5015, -1, -1, 15, implicit $exec
+ EXP 0, %5016, %5016, %5016, %5016, -1, -1, 15, implicit $exec
+ EXP 0, %5017, %5017, %5017, %5017, -1, -1, 15, implicit $exec
+ EXP 0, %5018, %5018, %5018, %5018, -1, -1, 15, implicit $exec
+ EXP 0, %5019, %5019, %5019, %5019, -1, -1, 15, implicit $exec
+ EXP 0, %5020, %5020, %5020, %5020, -1, -1, 15, implicit $exec
+ EXP 0, %5021, %5021, %5021, %5021, -1, -1, 15, implicit $exec
+ EXP 0, %5022, %5022, %5022, %5022, -1, -1, 15, implicit $exec
+ EXP 0, %5023, %5023, %5023, %5023, -1, -1, 15, implicit $exec
+ EXP 0, %5024, %5024, %5024, %5024, -1, -1, 15, implicit $exec
+ EXP 0, %5025, %5025, %5025, %5025, -1, -1, 15, implicit $exec
+ EXP 0, %5026, %5026, %5026, %5026, -1, -1, 15, implicit $exec
+ EXP 0, %5027, %5027, %5027, %5027, -1, -1, 15, implicit $exec
+ EXP 0, %5028, %5028, %5028, %5028, -1, -1, 15, implicit $exec
+ EXP 0, %5029, %5029, %5029, %5029, -1, -1, 15, implicit $exec
+ EXP 0, %5030, %5030, %5030, %5030, -1, -1, 15, implicit $exec
+ EXP 0, %5031, %5031, %5031, %5031, -1, -1, 15, implicit $exec
+ EXP 0, %5032, %5032, %5032, %5032, -1, -1, 15, implicit $exec
+ EXP 0, %5033, %5033, %5033, %5033, -1, -1, 15, implicit $exec
+ EXP 0, %5034, %5034, %5034, %5034, -1, -1, 15, implicit $exec
+ EXP 0, %5035, %5035, %5035, %5035, -1, -1, 15, implicit $exec
+ EXP 0, %5036, %5036, %5036, %5036, -1, -1, 15, implicit $exec
+ EXP 0, %5037, %5037, %5037, %5037, -1, -1, 15, implicit $exec
+ EXP 0, %5038, %5038, %5038, %5038, -1, -1, 15, implicit $exec
+ EXP 0, %5039, %5039, %5039, %5039, -1, -1, 15, implicit $exec
+ EXP 0, %5040, %5040, %5040, %5040, -1, -1, 15, implicit $exec
+ EXP 0, %5041, %5041, %5041, %5041, -1, -1, 15, implicit $exec
+ EXP 0, %5042, %5042, %5042, %5042, -1, -1, 15, implicit $exec
+ EXP 0, %5043, %5043, %5043, %5043, -1, -1, 15, implicit $exec
+ EXP 0, %5044, %5044, %5044, %5044, -1, -1, 15, implicit $exec
+ EXP 0, %5045, %5045, %5045, %5045, -1, -1, 15, implicit $exec
+ EXP 0, %5046, %5046, %5046, %5046, -1, -1, 15, implicit $exec
+ EXP 0, %5047, %5047, %5047, %5047, -1, -1, 15, implicit $exec
+ EXP 0, %5048, %5048, %5048, %5048, -1, -1, 15, implicit $exec
+ EXP 0, %5049, %5049, %5049, %5049, -1, -1, 15, implicit $exec
+ EXP 0, %5050, %5050, %5050, %5050, -1, -1, 15, implicit $exec
+ EXP 0, %5051, %5051, %5051, %5051, -1, -1, 15, implicit $exec
+ EXP 0, %5052, %5052, %5052, %5052, -1, -1, 15, implicit $exec
+ EXP 0, %5053, %5053, %5053, %5053, -1, -1, 15, implicit $exec
+ EXP 0, %5054, %5054, %5054, %5054, -1, -1, 15, implicit $exec
+ EXP 0, %5055, %5055, %5055, %5055, -1, -1, 15, implicit $exec
+ EXP 0, %5056, %5056, %5056, %5056, -1, -1, 15, implicit $exec
+ EXP 0, %5057, %5057, %5057, %5057, -1, -1, 15, implicit $exec
+ EXP 0, %5058, %5058, %5058, %5058, -1, -1, 15, implicit $exec
+ EXP 0, %5059, %5059, %5059, %5059, -1, -1, 15, implicit $exec
+ EXP 0, %5060, %5060, %5060, %5060, -1, -1, 15, implicit $exec
+ EXP 0, %5061, %5061, %5061, %5061, -1, -1, 15, implicit $exec
+ EXP 0, %5062, %5062, %5062, %5062, -1, -1, 15, implicit $exec
+ EXP 0, %5063, %5063, %5063, %5063, -1, -1, 15, implicit $exec
+ EXP 0, %5064, %5064, %5064, %5064, -1, -1, 15, implicit $exec
+ EXP 0, %5065, %5065, %5065, %5065, -1, -1, 15, implicit $exec
+ EXP 0, %5066, %5066, %5066, %5066, -1, -1, 15, implicit $exec
+ EXP 0, %5067, %5067, %5067, %5067, -1, -1, 15, implicit $exec
+ EXP 0, %5068, %5068, %5068, %5068, -1, -1, 15, implicit $exec
+ EXP 0, %5069, %5069, %5069, %5069, -1, -1, 15, implicit $exec
+ EXP 0, %5070, %5070, %5070, %5070, -1, -1, 15, implicit $exec
+ EXP 0, %5071, %5071, %5071, %5071, -1, -1, 15, implicit $exec
+ EXP 0, %5072, %5072, %5072, %5072, -1, -1, 15, implicit $exec
+ EXP 0, %5073, %5073, %5073, %5073, -1, -1, 15, implicit $exec
+ EXP 0, %5074, %5074, %5074, %5074, -1, -1, 15, implicit $exec
+ EXP 0, %5075, %5075, %5075, %5075, -1, -1, 15, implicit $exec
+ EXP 0, %5076, %5076, %5076, %5076, -1, -1, 15, implicit $exec
+ EXP 0, %5077, %5077, %5077, %5077, -1, -1, 15, implicit $exec
+ EXP 0, %5078, %5078, %5078, %5078, -1, -1, 15, implicit $exec
+ EXP 0, %5079, %5079, %5079, %5079, -1, -1, 15, implicit $exec
+ EXP 0, %5080, %5080, %5080, %5080, -1, -1, 15, implicit $exec
+ EXP 0, %5081, %5081, %5081, %5081, -1, -1, 15, implicit $exec
+ EXP 0, %5082, %5082, %5082, %5082, -1, -1, 15, implicit $exec
+ EXP 0, %5083, %5083, %5083, %5083, -1, -1, 15, implicit $exec
+ EXP 0, %5084, %5084, %5084, %5084, -1, -1, 15, implicit $exec
+ EXP 0, %5085, %5085, %5085, %5085, -1, -1, 15, implicit $exec
+ EXP 0, %5086, %5086, %5086, %5086, -1, -1, 15, implicit $exec
+ EXP 0, %5087, %5087, %5087, %5087, -1, -1, 15, implicit $exec
+ EXP 0, %5088, %5088, %5088, %5088, -1, -1, 15, implicit $exec
+ EXP 0, %5089, %5089, %5089, %5089, -1, -1, 15, implicit $exec
+ EXP 0, %5090, %5090, %5090, %5090, -1, -1, 15, implicit $exec
+ EXP 0, %5091, %5091, %5091, %5091, -1, -1, 15, implicit $exec
+ EXP 0, %5092, %5092, %5092, %5092, -1, -1, 15, implicit $exec
+ EXP 0, %5093, %5093, %5093, %5093, -1, -1, 15, implicit $exec
+ EXP 0, %5094, %5094, %5094, %5094, -1, -1, 15, implicit $exec
+ EXP 0, %5095, %5095, %5095, %5095, -1, -1, 15, implicit $exec
+ EXP 0, %5096, %5096, %5096, %5096, -1, -1, 15, implicit $exec
+ EXP 0, %5097, %5097, %5097, %5097, -1, -1, 15, implicit $exec
+ EXP 0, %5098, %5098, %5098, %5098, -1, -1, 15, implicit $exec
+ EXP 0, %5099, %5099, %5099, %5099, -1, -1, 15, implicit $exec
+ EXP 0, %50100, %50100, %50100, %50100, -1, -1, 15, implicit $exec
+ EXP 0, %50101, %50101, %50101, %50101, -1, -1, 15, implicit $exec
+ EXP 0, %50102, %50102, %50102, %50102, -1, -1, 15, implicit $exec
+ EXP 0, %50103, %50103, %50103, %50103, -1, -1, 15, implicit $exec
+ EXP 0, %50104, %50104, %50104, %50104, -1, -1, 15, implicit $exec
+ EXP 0, %50105, %50105, %50105, %50105, -1, -1, 15, implicit $exec
+ EXP 0, %50106, %50106, %50106, %50106, -1, -1, 15, implicit $exec
+ EXP 0, %50107, %50107, %50107, %50107, -1, -1, 15, implicit $exec
+ EXP 0, %50108, %50108, %50108, %50108, -1, -1, 15, implicit $exec
+ EXP 0, %50109, %50109, %50109, %50109, -1, -1, 15, implicit $exec
+ EXP 0, %50110, %50110, %50110, %50110, -1, -1, 15, implicit $exec
+ EXP 0, %50111, %50111, %50111, %50111, -1, -1, 15, implicit $exec
+ EXP 0, %50112, %50112, %50112, %50112, -1, -1, 15, implicit $exec
+ EXP 0, %50113, %50113, %50113, %50113, -1, -1, 15, implicit $exec
+ EXP 0, %50114, %50114, %50114, %50114, -1, -1, 15, implicit $exec
+ EXP 0, %50115, %50115, %50115, %50115, -1, -1, 15, implicit $exec
+ EXP 0, %50116, %50116, %50116, %50116, -1, -1, 15, implicit $exec
+ EXP 0, %50117, %50117, %50117, %50117, -1, -1, 15, implicit $exec
+ EXP 0, %50118, %50118, %50118, %50118, -1, -1, 15, implicit $exec
+ EXP 0, %50119, %50119, %50119, %50119, -1, -1, 15, implicit $exec
+ EXP 0, %50120, %50120, %50120, %50120, -1, -1, 15, implicit $exec
+ EXP 0, %50121, %50121, %50121, %50121, -1, -1, 15, implicit $exec
+ EXP 0, %50122, %50122, %50122, %50122, -1, -1, 15, implicit $exec
+ EXP 0, %50123, %50123, %50123, %50123, -1, -1, 15, implicit $exec
+ EXP 0, %50124, %50124, %50124, %50124, -1, -1, 15, implicit $exec
+ EXP 0, %50125, %50125, %50125, %50125, -1, -1, 15, implicit $exec
+ EXP 0, %50126, %50126, %50126, %50126, -1, -1, 15, implicit $exec
+ EXP 0, %50127, %50127, %50127, %50127, -1, -1, 15, implicit $exec
+ EXP 0, %50128, %50128, %50128, %50128, -1, -1, 15, implicit $exec
+ EXP 0, %50129, %50129, %50129, %50129, -1, -1, 15, implicit $exec
+ EXP 0, %50130, %50130, %50130, %50130, -1, -1, 15, implicit $exec
+ EXP 0, %50131, %50131, %50131, %50131, -1, -1, 15, implicit $exec
+ EXP 0, %50132, %50132, %50132, %50132, -1, -1, 15, implicit $exec
+ EXP 0, %50133, %50133, %50133, %50133, -1, -1, 15, implicit $exec
+
+
+ %8000:vgpr_32 = IMPLICIT_DEF
+ %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+ $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+
+ %8001:vgpr_32 = COPY %8000
+ %8002:vgpr_32 = COPY %8000
+ %8003:vgpr_32 = COPY %8000
+ %8004:vgpr_32 = COPY %8000
+ %8005:vgpr_32 = COPY %8000
+ %8006:vgpr_32 = COPY %8000
+ %8007:vgpr_32 = COPY %8000
+ %8008:vgpr_32 = COPY %8000
+ %8009:vgpr_32 = COPY %8000
+ %8010:vgpr_32 = COPY %8000
+ %8011:vgpr_32 = COPY %8000
+ %8012:vgpr_32 = COPY %8000
+ %8013:vgpr_32 = COPY %8000
+ %8014:vgpr_32 = COPY %8000
+ %8015:vgpr_32 = COPY %8000
+ %8016:vgpr_32 = COPY %8000
+ %8017:vgpr_32 = COPY %8000
+
+ %9001:vgpr_32 = COPY %8001
+ %9002:vgpr_32 = COPY %8002
+ %9003:vgpr_32 = COPY %8003
+ %9004:vgpr_32 = COPY %8004
+ %9005:vgpr_32 = COPY %8005
+ %9006:vgpr_32 = COPY %8006
+ %9007:vgpr_32 = COPY %8007
+ %9008:vgpr_32 = COPY %8008
+ %9009:vgpr_32 = COPY %8009
+ %9010:vgpr_32 = COPY %8010
+ %9011:vgpr_32 = COPY %8011
+ %9012:vgpr_32 = COPY %8012
+ %9013:vgpr_32 = COPY %8013
+ %9014:vgpr_32 = COPY %8014
+ %9015:vgpr_32 = COPY %8015
+ %9016:vgpr_32 = COPY %8016
+ %9017:vgpr_32 = COPY %8017
+
+ S_BRANCH %bb.2
+
+ bb.2:
+
+ %3:vgpr_32 = IMPLICIT_DEF
+
+ EXP 0, killed %500, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %501, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %502, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %503, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %504, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %505, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %506, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %507, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %508, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %509, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5010, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5011, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5012, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5013, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5014, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5015, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5016, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5017, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5018, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5019, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5020, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5021, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5022, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5023, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5024, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5025, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5026, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5027, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5028, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5029, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5030, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5031, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5032, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5033, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5034, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5035, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5036, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5037, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5038, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5039, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5040, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5041, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5042, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5043, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5044, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5045, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5046, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5047, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5048, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5049, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5050, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5051, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5052, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5053, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5054, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5055, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5056, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5057, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5058, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5059, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5060, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5061, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5062, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5063, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5064, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5065, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5066, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5067, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5068, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5069, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5070, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5071, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5072, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5073, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5074, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5075, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5076, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5077, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5078, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5079, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5080, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5081, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5082, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5083, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5084, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5085, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5086, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5087, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5088, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5089, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5090, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5091, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5092, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5093, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5094, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5095, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5096, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5097, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5098, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %5099, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50100, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50101, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50102, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50103, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50104, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50105, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50106, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50107, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50108, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50109, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50110, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50111, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50112, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50113, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50114, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50115, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50116, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50117, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50118, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50119, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50120, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50121, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50122, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50123, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50124, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50125, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50126, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50127, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50128, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50129, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50130, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50131, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50132, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %50133, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+ S_ENDPGM 0
+...
+
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
new file mode 100644
index 000000000000000..bc2c97f91f46c67
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
@@ -0,0 +1,450 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s
+
+# Check that the loads have been moved to the use
+# CHECK: bb.2:
+# CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0
+# CHECK: %[[#reg1:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 16, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg1]], %{{.+}}, 16, 0
+# CHECK: %[[#reg2:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 32, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg2]], %{{.+}}, 32, 0
+# CHECK: %[[#reg3:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 48, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg3]], %{{.+}}, 48, 0
+# CHECK: %[[#reg4:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 64, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg4]], %{{.+}}, 64, 0
+# CHECK: %[[#reg5:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 80, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg5]], %{{.+}}, 80, 0
+# CHECK: %[[#reg6:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 96, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg6]], %{{.+}}, 96, 0
+# CHECK: %[[#reg7:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 112, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg7]], %{{.+}}, 112, 0
+# CHECK: %[[#reg8:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 128, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg8]], %{{.+}}, 128, 0
+# CHECK: %[[#reg9:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 144, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg9]], %{{.+}}, 144, 0
+# CHECK: %[[#reg10:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 160, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg10]], %{{.+}}, 160, 0
+# CHECK: %[[#reg11:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 176, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg11]], %{{.+}}, 176, 0
+# CHECK: %[[#reg12:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 192, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg12]], %{{.+}}, 192, 0
+# CHECK: %[[#reg13:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 208, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg13]], %{{.+}}, 208, 0
+# CHECK: %[[#reg14:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 224, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0
+# CHECK: %[[#reg15:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 240, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0
+# CHECK: %[[#reg16:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 256, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0
+# CHECK: %[[#reg17:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 272, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0
+# CHECK: %[[#reg18:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 288, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0
+# CHECK: %[[#reg19:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 304, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0
+# CHECK: %[[#reg20:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 320, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0
+# CHECK: %[[#reg21:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 336, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0
+# CHECK: %[[#reg22:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 352, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0
+# CHECK: %[[#reg23:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 368, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0
+# CHECK: %[[#reg24:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 384, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0
+# CHECK: %[[#reg25:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 400, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0
+# CHECK: %[[#reg26:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 416, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0
+# CHECK: %[[#reg27:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 432, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0
+# CHECK: %[[#reg28:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 448, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0
+# CHECK: %[[#reg29:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 464, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0
+# CHECK: %[[#reg30:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 480, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0
+# CHECK: %[[#reg31:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 496, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0
+# CHECK: %[[#reg32:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 512, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0
+# CHECK: %[[#reg33:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 528, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0
+# CHECK: %[[#reg34:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 544, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0
+# CHECK: %[[#reg35:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 560, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0
+# CHECK: %[[#reg36:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 576, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0
+# CHECK: %[[#reg37:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 592, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0
+# CHECK: %[[#reg38:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 608, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0
+# CHECK: %[[#reg39:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 624, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0
+# CHECK: %[[#reg40:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 640, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0
+# CHECK: %[[#reg41:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 656, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0
+# CHECK: %[[#reg42:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 672, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0
+# CHECK: %[[#reg43:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 688, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0
+# CHECK: %[[#reg44:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 704, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0
+# CHECK: %[[#reg45:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 720, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0
+# CHECK: %[[#reg46:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 736, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0
+# CHECK: %[[#reg47:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 752, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0
+# CHECK: %[[#reg48:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 768, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0
+# CHECK: %[[#reg49:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 784, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0
+# CHECK: %[[#reg50:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 800, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0
+# CHECK: %[[#reg51:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 816, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0
+# CHECK: %[[#reg52:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 832, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0
+# CHECK: %[[#reg53:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 848, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0
+# CHECK: %[[#reg54:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 864, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0
+# CHECK: %[[#reg55:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 880, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0
+# CHECK: %[[#reg56:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 896, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0
+# CHECK: %[[#reg57:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 912, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0
+# CHECK: %[[#reg58:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 928, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0
+# CHECK: %[[#reg59:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 944, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0
+# CHECK: %[[#reg60:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 960, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0
+# CHECK: %[[#reg61:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 976, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0
+# CHECK: %[[#reg62:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 992, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0
+# CHECK: %[[#reg63:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0
+
+
+--- |
+ source_filename = ".\main.ll"
+ define amdgpu_ps void @main() #1 {
+ ret void
+ }
+ attributes #1 = { "target-cpu"="gfx1010" }
+ !llvm.ident = !{!0}
+ !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name: main
+tracksRegLiveness: true
+liveins:
+ - { reg: '$sgpr0' }
+ - { reg: '$sgpr1' }
+ - { reg: '$sgpr2' }
+ - { reg: '$sgpr3' }
+ - { reg: '$sgpr4' }
+ - { reg: '$sgpr5' }
+ - { reg: '$sgpr6' }
+ - { reg: '$sgpr7' }
+ - { reg: '$sgpr8' }
+ - { reg: '$sgpr8' }
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
+
+ undef %0.sub0:sgpr_64 = COPY $sgpr0
+ undef %0.sub1:sgpr_64 = COPY $sgpr1
+
+ undef %1.sub0:sgpr_128 = COPY $sgpr4
+ undef %1.sub1:sgpr_128 = COPY $sgpr5
+ undef %1.sub2:sgpr_128 = COPY $sgpr6
+ undef %1.sub3:sgpr_128 = COPY $sgpr7
+
+ %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0
+ %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0
+ %3002:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 32, 0
+ %3003:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 48, 0
+ %3004:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 64, 0
+ %3005:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 80, 0
+ %3006:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 96, 0
+ %3007:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 112, 0
+ %3008:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 128, 0
+ %3009:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 144, 0
+ %30010:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 160, 0
+ %30011:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 176, 0
+ %30012:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 192, 0
+ %30013:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 208, 0
+ %30014:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 224, 0
+ %30015:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 240, 0
+ %30016:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 256, 0
+ %30017:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 272, 0
+ %30018:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 288, 0
+ %30019:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 304, 0
+ %30020:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 320, 0
+ %30021:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 336, 0
+ %30022:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 352, 0
+ %30023:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 368, 0
+ %30024:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 384, 0
+ %30025:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 400, 0
+ %30026:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 416, 0
+ %30027:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 432, 0
+ %30028:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 448, 0
+ %30029:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 464, 0
+ %30030:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 480, 0
+ %30031:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 496, 0
+ %30032:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 512, 0
+ %30033:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 528, 0
+ %30034:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 544, 0
+ %30035:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 560, 0
+ %30036:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 576, 0
+ %30037:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 592, 0
+ %30038:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 608, 0
+ %30039:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 624, 0
+ %30040:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 640, 0
+ %30041:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 656, 0
+ %30042:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 672, 0
+ %30043:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 688, 0
+ %30044:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 704, 0
+ %30045:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 720, 0
+ %30046:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 736, 0
+ %30047:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 752, 0
+ %30048:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 768, 0
+ %30049:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 784, 0
+ %30050:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 800, 0
+ %30051:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 816, 0
+ %30052:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 832, 0
+ %30053:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 848, 0
+ %30054:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 864, 0
+ %30055:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 880, 0
+ %30056:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 896, 0
+ %30057:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 912, 0
+ %30058:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 928, 0
+ %30059:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 944, 0
+ %30060:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 960, 0
+ %30061:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 976, 0
+ %30062:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 992, 0
+ %30063:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 1008, 0
+
+ %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+
+ %8000:vgpr_32 = IMPLICIT_DEF
+ %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+ $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+ %8001:vgpr_32 = COPY %8000
+ S_BRANCH %bb.2
+
+ bb.2:
+
+ %3:vgpr_32 = IMPLICIT_DEF
+ S_BUFFER_STORE_DWORDX4_IMM killed %3000:sgpr_128, %1:sgpr_128, 0, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3001:sgpr_128, %1:sgpr_128, 16, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3002:sgpr_128, %1:sgpr_128, 32, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3003:sgpr_128, %1:sgpr_128, 48, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3004:sgpr_128, %1:sgpr_128, 64, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3005:sgpr_128, %1:sgpr_128, 80, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3006:sgpr_128, %1:sgpr_128, 96, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3007:sgpr_128, %1:sgpr_128, 112, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3008:sgpr_128, %1:sgpr_128, 128, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %3009:sgpr_128, %1:sgpr_128, 144, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30010:sgpr_128, %1:sgpr_128, 160, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30011:sgpr_128, %1:sgpr_128, 176, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30012:sgpr_128, %1:sgpr_128, 192, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30013:sgpr_128, %1:sgpr_128, 208, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0
+ S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0
+
+ EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec
+ EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+ S_ENDPGM 0
+...
>From a13cfc4dcc49c810182bf5ca2bd3b3f0a40c75cd Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Thu, 6 Feb 2025 14:09:32 -0800
Subject: [PATCH 3/3] Test renames, only keeping the required flags for the
tests
---
.../remat/{group_remat_with_uses.mir => group_remat_clone.mir} | 2 +-
.../AMDGPU/remat/{group_remat.mir => group_remat_move.mir} | 0
llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 2 +-
llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir | 2 +-
4 files changed, 3 insertions(+), 3 deletions(-)
rename llvm/test/CodeGen/AMDGPU/remat/{group_remat_with_uses.mir => group_remat_clone.mir} (99%)
rename llvm/test/CodeGen/AMDGPU/remat/{group_remat.mir => group_remat_move.mir} (100%)
diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir
similarity index 99%
rename from llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir
rename to llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir
index 637a683bdd041d4..c99a1835454fd1c 100644
--- a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir
@@ -638,4 +638,4 @@ body: |
S_ENDPGM 0
...
-
\ No newline at end of file
+
diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/remat/group_remat.mir
rename to llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir
diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
index bc2c97f91f46c67..528515d235c8b60 100644
--- a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
@@ -1,6 +1,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s
-# Check that the loads have been moved to the use
+# Check that the scalar loads have been moved to the use
# CHECK: bb.2:
# CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0
# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir
index e8a66b47ac732b5..53f59cc3f8b0b09 100644
--- a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-hot-block-remat-aggressive -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-sub-exp-remat | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-sub-exp-remat | FileCheck %s
# DEFS
# CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec