[llvm] [AMDGPU][Scheduler] Refactor VGPR rematerialization during scheduling (PR #118722)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 7 03:22:54 PST 2025
https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/118722
>From e5abab8b2143a4813359097fded2e1965ebaf591 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Wed, 4 Dec 2024 15:49:33 +0100
Subject: [PATCH 01/11] Working refactoring of simple VGPR remat.
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 404 +++++++++---------
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 56 ++-
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 4 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 13 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 6 +
.../machine-scheduler-sink-trivial-remats.mir | 145 ++++++-
7 files changed, 418 insertions(+), 215 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 57f517bfba0ebb..1b39a5a7db7192 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -25,8 +25,13 @@
#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
+#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/ErrorHandling.h"
#define DEBUG_TYPE "machine-scheduler"
@@ -945,20 +950,19 @@ bool PreRARematStage::initGCNSchedStage() {
return false;
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
- // Check maximum occupancy
+ // Rematerialization will not help if occupancy is LDS-limited.
if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
DAG.MinOccupancy)
return false;
// FIXME: This pass will invalidate cached MBBLiveIns for regions
- // inbetween the defs and region we sinked the def to. Cached pressure
- // for regions where a def is sinked from will also be invalidated. Will
- // need to be fixed if there is another pass after this pass.
+ // in between the defs and the region we sank the def to. Will need to be fixed
+ // if there is another pass after this pass.
assert(!S.hasNextStage());
- collectRematerializableInstructions();
- if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
+ if (!collectRematerializableInstructions())
return false;
+ sinkTriviallyRematInsts(ST, TII);
LLVM_DEBUG(
dbgs() << "Retrying function scheduling with improved occupancy of "
@@ -1467,231 +1471,249 @@ void GCNSchedStage::revertScheduling() {
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}
-void PreRARematStage::collectRematerializableInstructions() {
+/// Allows to easily filter for this stage's debug output.
+#define RA_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
+
+bool PreRARematStage::collectRematerializableInstructions() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
- for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
- Register Reg = Register::index2VirtReg(I);
- if (!DAG.LIS->hasInterval(Reg))
- continue;
- // TODO: Handle AGPR and SGPR rematerialization
- if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
- !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
- continue;
+ RA_DEBUG(dbgs() << "Collecting rematerializable instructions\n");
- MachineOperand *Op = DAG.MRI.getOneDef(Reg);
- MachineInstr *Def = Op->getParent();
- if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
+ // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
+ // occupancy) to the numbers of VGPRs that must be deducted from their maximum
+ // VGPR pressure for their occupancy to be increased by one.
+ DenseMap<unsigned, unsigned> OptRegions;
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ if (!DAG.RegionsWithMinOcc[I])
continue;
+ GCNRegPressure &RP = DAG.Pressure[I];
- MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
- if (Def->getParent() == UseI->getParent())
+ // We do not rematerialize SGPR-defining regions yet so do not bother
+ // optimizing regions whose occupancy is SGPR-limited.
+ if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy)
continue;
- // We are only collecting defs that are defined in another block and are
- // live-through or used inside regions at MinOccupancy. This means that the
- // register must be in the live-in set for the region.
- bool AddedToRematList = false;
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- auto It = DAG.LiveIns[I].find(Reg);
- if (It != DAG.LiveIns[I].end() && !It->second.none()) {
- if (DAG.RegionsWithMinOcc[I]) {
- RematerializableInsts[I][Def] = UseI;
- AddedToRematList = true;
- }
-
- // Collect regions with rematerializable reg as live-in to avoid
- // searching later when updating RP.
- RematDefToLiveInRegions[Def].push_back(I);
- }
- }
- if (!AddedToRematList)
- RematDefToLiveInRegions.erase(Def);
+ unsigned NumVGPRs = RP.getVGPRNum(ST.hasGFX90AInsts());
+ unsigned NumToIncreaseOcc = ST.getNumVGPRsToIncreaseOccupancy(NumVGPRs);
+ OptRegions.insert({I, NumToIncreaseOcc});
+ RA_DEBUG(dbgs() << "Region " << I << " has min. occupancy: decrease by "
+ << NumToIncreaseOcc << " VGPR(s) to improve occupancy\n");
}
-}
-
-bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
- const TargetInstrInfo *TII) {
- // Temporary copies of cached variables we will be modifying and replacing if
- // sinking succeeds.
- SmallVector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
- NewRegions;
- DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
- DenseMap<unsigned, GCNRegPressure> NewPressure;
- BitVector NewRescheduleRegions;
- LiveIntervals *LIS = DAG.LIS;
-
- NewRegions.resize(DAG.Regions.size());
- NewRescheduleRegions.resize(DAG.Regions.size());
-
- // Collect only regions that has a rematerializable def as a live-in.
- SmallSet<unsigned, 16> ImpactedRegions;
- for (const auto &It : RematDefToLiveInRegions)
- ImpactedRegions.insert(It.second.begin(), It.second.end());
-
- // Make copies of register pressure and live-ins cache that will be updated
- // as we rematerialize.
- for (auto Idx : ImpactedRegions) {
- NewPressure[Idx] = DAG.Pressure[Idx];
- NewLiveIns[Idx] = DAG.LiveIns[Idx];
- }
- NewRegions = DAG.Regions;
- NewRescheduleRegions.reset();
+ if (OptRegions.empty())
+ return false;
- DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
- bool Improved = false;
- for (auto I : ImpactedRegions) {
- if (!DAG.RegionsWithMinOcc[I])
- continue;
+ // Tracks estimated rematerialization gains (i.e., reduction in RP) for
+ // this instruction in each optimizable region.
+ auto ReduceRPInRegion = [&](auto OptIt, unsigned I,
+ LaneBitmask Mask) -> bool {
+ auto NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
+ unsigned &RPExcess = OptIt->getSecond();
+ if (NumRegs >= RPExcess) {
+ OptRegions.erase(I);
+ LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
+ << "\n");
+ } else {
+ RPExcess -= NumRegs;
+ LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
+ << " by " << NumRegs << " (" << RPExcess << " left)\n");
+ }
+ return OptRegions.empty();
+ };
+
+ // We need up-to-date live-out information to query live-out register masks in
+ // regions containing rematerializable instructions.
+ DAG.RegionLiveOuts.buildLiveRegMap();
+
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ auto Region = DAG.Regions[I];
+ for (auto MI = Region.first; MI != Region.second; ++MI) {
+ // We only support instructions with at least one register (but
+ // non-subregister) operand.
+ MachineInstr &DefMI = *MI;
+ if (DefMI.isBundle() || !DefMI.getNumOperands() ||
+ !DefMI.getOperand(0).isReg() || DefMI.getOperand(0).getSubReg())
+ continue;
- Improved = false;
- int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
- int SGPRUsage = NewPressure[I].getSGPRNum();
+ // We only support rematerializing virtual VGPRs with one definition and
+ // one use.
+ Register Reg = DefMI.getOperand(0).getReg();
+ if (!Reg.isVirtual() || !DAG.LIS->hasInterval(Reg) ||
+ !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
+ !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
+ continue;
- // TODO: Handle occupancy drop due to AGPR and SGPR.
- // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
- if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
- break;
+ // The instruction must be trivially rematerializable and have no virtual
+ // register use.
+ if (!isTriviallyReMaterializable(DefMI))
+ continue;
- // The occupancy of this region could have been improved by a previous
- // iteration's sinking of defs.
- if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
- NewRescheduleRegions[I] = true;
- Improved = true;
- continue;
- }
+ // We only care to rematerialize the instruction if its single use is in a
+ // different block.
+ MachineInstr *UseMI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
+ if (DefMI.getParent() == UseMI->getParent())
+ continue;
- // First check if we have enough trivially rematerializable instructions to
- // improve occupancy. Optimistically assume all instructions we are able to
- // sink decreased RP.
- int TotalSinkableRegs = 0;
- for (const auto &It : RematerializableInsts[I]) {
- MachineInstr *Def = It.first;
- Register DefReg = Def->getOperand(0).getReg();
- TotalSinkableRegs +=
- SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
- }
- int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
- unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
- // If in the most optimistic scenario, we cannot improve occupancy, then do
- // not attempt to sink any instructions.
- if (OptimisticOccupancy <= DAG.MinOccupancy)
- break;
+ RA_DEBUG(dbgs() << "In region " << I << ", instruction " << DefMI
+ << " is rematerializable with single use " << *UseMI);
+ auto &Remat = RematInstructions.emplace_back(&DefMI, I, UseMI);
+
+ bool RematUseful = false;
+ if (auto It = OptRegions.find(I); It != OptRegions.end()) {
+ // Optimistically consider that moving the instruction out of its
+ // defining region will reduce RP in the latter; this assumes that
+ // maximum RP in the region is reached somewhere between the defining
+ // instruction and the end of the region.
+ RA_DEBUG(dbgs() << " Instruction's defining region is optimizable: ");
+ RematUseful = true;
+ auto RegMask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
+ if (ReduceRPInRegion(It, I, RegMask))
+ return true;
+ }
- unsigned ImproveOccupancy = 0;
- SmallVector<MachineInstr *, 4> SinkedDefs;
- for (auto &It : RematerializableInsts[I]) {
- MachineInstr *Def = It.first;
- MachineBasicBlock::iterator InsertPos =
- MachineBasicBlock::iterator(It.second);
- Register Reg = Def->getOperand(0).getReg();
- // Rematerialize MI to its use block. Since we are only rematerializing
- // instructions that do not have any virtual reg uses, we do not need to
- // call LiveRangeEdit::allUsesAvailableAt() and
- // LiveRangeEdit::canRematerializeAt().
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
- Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
- MachineInstr *NewMI = &*std::prev(InsertPos);
- LIS->InsertMachineInstrInMaps(*NewMI);
- LIS->removeInterval(Reg);
- LIS->createAndComputeVirtRegInterval(Reg);
- InsertedMIToOldDef[NewMI] = Def;
-
- // Update region boundaries in scheduling region we sinked from since we
- // may sink an instruction that was at the beginning or end of its region
- DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
- /*Removing =*/true);
-
- // Update region boundaries in region we sinked to.
- DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
-
- LaneBitmask PrevMask = NewLiveIns[I][Reg];
- // FIXME: Also update cached pressure for where the def was sinked from.
- // Update RP for all regions that has this reg as a live-in and remove
- // the reg from all regions as a live-in.
- for (auto Idx : RematDefToLiveInRegions[Def]) {
- NewLiveIns[Idx].erase(Reg);
- if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
- // Def is live-through and not used in this block.
- NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
+ for (unsigned LVRegion = 0; LVRegion != E; ++LVRegion) {
+ // We are only collecting regions in which the register is a live-in
+ // (and may be live-through).
+ auto It = DAG.LiveIns[LVRegion].find(Reg);
+ if (It == DAG.LiveIns[LVRegion].end() || It->second.none())
+ continue;
+ Remat.LiveInRegions.insert(LVRegion);
+ RA_DEBUG(dbgs() << " Def is live-in in region " << LVRegion
+ << ": ");
+
+ // Account for the reduction in RP due to the rematerialization in an
+ // optimizable region in which the defined register is a live-in. This
+ // is exact for live-through regions but optimistic in the using region,
+ // where RP is actually reduced only if maximum RP is reached somewhere
+ // between the beginning of the region and the rematerializable
+ // instruction's use.
+ if (auto It = OptRegions.find(LVRegion); It != OptRegions.end()) {
+ RematUseful = true;
+ if (ReduceRPInRegion(It, LVRegion, DAG.LiveIns[LVRegion][Reg]))
+ return true;
} else {
- // Def is used and rematerialized into this block.
- GCNDownwardRPTracker RPT(*LIS);
- auto *NonDbgMI = &*skipDebugInstructionsForward(
- NewRegions[Idx].first, NewRegions[Idx].second);
- RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
- RPT.advance(NewRegions[Idx].second);
- NewPressure[Idx] = RPT.moveMaxPressure();
+ LLVM_DEBUG(dbgs() << "unoptimizable region\n");
}
}
- SinkedDefs.push_back(Def);
- ImproveOccupancy = NewPressure[I].getOccupancy(ST);
- if (ImproveOccupancy > DAG.MinOccupancy)
- break;
+ // If the instruction is not a live-in or live-out in any optimizable
+ // region then there is no point in rematerializing it.
+ if (!RematUseful) {
+ RematInstructions.pop_back();
+ RA_DEBUG(
+ dbgs()
+ << " No impact on any optimizable region, dropping instruction\n");
+ }
}
+ }
- // Remove defs we just sinked from all regions' list of sinkable defs
- for (auto &Def : SinkedDefs)
- for (auto TrackedIdx : RematDefToLiveInRegions[Def])
- RematerializableInsts[TrackedIdx].erase(Def);
+ RA_DEBUG(dbgs() << "Cannot increase occupancy through rematerialization\n");
+ return false;
+}
- if (ImproveOccupancy <= DAG.MinOccupancy)
- break;
+void PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
+ const TargetInstrInfo *TII) {
+ // Collect regions whose live-ins or register pressure will change due to
+ // rematerialization, and map those whose maximum RP we need to fully
+ // recompute at the end to true.
+ SmallDenseMap<unsigned, bool> ImpactedRegions;
+ // Maps rematerialized instructions to the one they were rematerialized from.
+ DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
+ LiveIntervals *LIS = DAG.LIS;
- NewRescheduleRegions[I] = true;
- Improved = true;
+ // TODO: In the spirit of rematerializing the minimum number of instructions
+ // to increase occupancy, here we could sort the list of rematerializable
+ // instructions in decreasing order of "expected profitability" so that we end
+ // up moving as few instructions as possible in the loop below.
+ for (RematInstruction &Remat : RematInstructions) {
+ MachineInstr *DefMI = Remat.RematMI;
+ MachineBasicBlock::iterator InsertPos(Remat.UseMI);
+ Register Reg = DefMI->getOperand(0).getReg();
+
+ // Rematerialize MI to its use block. Since we are only rematerializing
+ // instructions that do not have any virtual reg uses, we do not need to
+ // call LiveRangeEdit::allUsesAvailableAt() and
+ // LiveRangeEdit::canRematerializeAt().
+ TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
+ DefMI->getOperand(0).getSubReg(), *DefMI, *DAG.TRI);
+ MachineInstr *NewMI = &*std::prev(InsertPos);
+ LIS->InsertMachineInstrInMaps(*NewMI);
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ InsertedMIToOldDef[NewMI] = DefMI;
+
+ // Update region boundaries in the scheduling region we sank from since we
+ // may sink an instruction that was at the beginning or end of its region.
+ DAG.updateRegionBoundaries(DAG.Regions, DefMI, /*NewMI =*/nullptr,
+ /*Removing =*/true);
+
+ // Update region boundaries in region we sinked to.
+ DAG.updateRegionBoundaries(DAG.Regions, InsertPos, NewMI);
+
+ // Collect all regions impacted by the rematerialization and update their
+ // live-in/RP information.
+ for (unsigned I : Remat.LiveInRegions) {
+ ImpactedRegions.insert({I, false});
+ GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
+
+ // The register is no longer a live-in in all regions but the one that
+ // contains the single use. In live-through regions, maximum register
+ // pressure decreases predictably so we can directly update it. In the
+ // using region, maximum register pressure may or may not decrease, so we
+ // will mark it for re-computation after all materializations.
+ RegionLiveIns.erase(Reg);
+ if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent()) {
+ // Register is live-through and not used in this block.
+ LaneBitmask PrevMask = RegionLiveIns[Reg];
+ DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
+ } else {
+ // Register is used in this block.
+ ImpactedRegions[I] = true;
+ }
+ }
+
+ // RP in the region from which the instruction was rematerialized may or may
+ // not change; recompute it fully later.
+ ImpactedRegions[Remat.DefRegion] = true;
}
- if (!Improved) {
- // Occupancy was not improved for all regions that were at MinOccupancy.
- // Undo sinking and remove newly rematerialized instructions.
- for (auto &Entry : InsertedMIToOldDef) {
- MachineInstr *MI = Entry.first;
- MachineInstr *OldMI = Entry.second;
- Register Reg = MI->getOperand(0).getReg();
- LIS->RemoveMachineInstrFromMaps(*MI);
- MI->eraseFromParent();
- OldMI->clearRegisterDeads(Reg);
- LIS->removeInterval(Reg);
- LIS->createAndComputeVirtRegInterval(Reg);
+ // All regions impacted by at least one rematerialization must be rescheduled.
+ BitVector NewRescheduleRegions(DAG.Regions.size());
+ for (auto &[I, RecomputeRP] : ImpactedRegions) {
+ NewRescheduleRegions[I] = true;
+ DAG.MBBLiveIns.erase(DAG.Regions[I].first->getParent());
+
+ // Recompute maximum RP in regions in which at least one instruction was
+ // rematerialized from or to.
+ if (RecomputeRP) {
+ GCNDownwardRPTracker RPT(*LIS);
+ auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
+ DAG.Regions[I].second);
+ RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
+ RPT.advance(DAG.Regions[I].second);
+ DAG.Pressure[I] = RPT.moveMaxPressure();
}
- return false;
}
+ DAG.RescheduleRegions = NewRescheduleRegions;
- // Occupancy was improved for all regions.
- for (auto &Entry : InsertedMIToOldDef) {
- MachineInstr *MI = Entry.first;
- MachineInstr *OldMI = Entry.second;
-
- // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
+ // Clean up the IR; remove rematerialized instructions from state.
+ for (auto &[NewMI, OldMI] : InsertedMIToOldDef) {
+ // Remove rematerialized instruction from BBLiveInMap since we are sinking
+ // it from its MBB.
DAG.BBLiveInMap.erase(OldMI);
- // Remove OldMI and update LIS
- Register Reg = MI->getOperand(0).getReg();
+ // Remove the rematerialized instruction and update LIS.
+ Register Reg = NewMI->getOperand(0).getReg();
LIS->RemoveMachineInstrFromMaps(*OldMI);
OldMI->eraseFromParent();
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
}
- // Update live-ins, register pressure, and regions caches.
- for (auto Idx : ImpactedRegions) {
- DAG.LiveIns[Idx] = NewLiveIns[Idx];
- DAG.Pressure[Idx] = NewPressure[Idx];
- DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
- }
- DAG.Regions = NewRegions;
- DAG.RescheduleRegions = NewRescheduleRegions;
-
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
-
- return true;
}
// Copied from MachineLICM
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 64d517038f90e0..c378564d5b2e97 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -14,7 +14,8 @@
#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
#include "GCNRegPressure.h"
-#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
@@ -421,27 +422,42 @@ class ClusteredLowOccStage : public GCNSchedStage {
class PreRARematStage : public GCNSchedStage {
private:
- // Each region at MinOccupancy will have their own list of trivially
- // rematerializable instructions we can remat to reduce RP. The list maps an
- // instruction to the position we should remat before, usually the MI using
- // the rematerializable instruction.
- MapVector<unsigned, MapVector<MachineInstr *, MachineInstr *>>
- RematerializableInsts;
-
- // Map a trivially rematerializable def to a list of regions at MinOccupancy
- // that has the defined reg as a live-in.
- DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;
-
- // Collect all trivially rematerializable VGPR instructions with a single def
- // and single use outside the defining block into RematerializableInsts.
- void collectRematerializableInstructions();
-
+ /// A trivially rematerializable VGPR-defining instruction along with
+ /// pre-computed information to help update the scheduler's status when we
+ /// rematerialize it.
+ struct RematInstruction {
+ /// Trivially rematerializable instruction.
+ MachineInstr *RematMI;
+ /// Region containing the rematerializable instruction.
+ unsigned DefRegion;
+ /// Single use of the rematerializable instruction's defined register,
+ /// located in a different block.
+ MachineInstr *UseMI;
+ /// Set of regions in which the rematerializable instruction's defined
+ /// register is a live-in.
+ SmallDenseSet<unsigned, 4> LiveInRegions;
+
+ RematInstruction(MachineInstr *RematMI, unsigned DefRegion,
+ MachineInstr *UseMI)
+ : RematMI(RematMI), DefRegion(DefRegion), UseMI(UseMI) {}
+ };
+
+ /// List of eligible rematerializable instructions to sink to increase
+ /// occupancy, in function instruction order.
+ std::vector<RematInstruction> RematInstructions;
+
+ /// Collect all trivially rematerializable VGPR instructions with a single def
+ /// and single use outside the defining block into RematerializableInsts.
+ bool collectRematerializableInstructions();
+
+ /// Whether the MI is trivially rematerializable and does not have any
+ /// virtual register use.
bool isTriviallyReMaterializable(const MachineInstr &MI);
- // TODO: Should also attempt to reduce RP of SGPRs and AGPRs
- // Attempt to reduce RP of VGPR by sinking trivially rematerializable
- // instructions. Returns true if we were able to sink instruction(s).
- bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
+ /// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
+ /// Sinks all instructions in RematInstructions to increase function
+ /// occupancy. Modified regions are tagged for rescheduling.
+ void sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII);
public:
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 51361b75940560..78f15ed8be8cae 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -367,6 +367,10 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}
+unsigned GCNSubtarget::getNumVGPRsToIncreaseOccupancy(unsigned NumVGPRs) const {
+ return AMDGPU::IsaInfo::getNumVGPRsToIncreaseWavesPerEU(this, NumVGPRs);
+}
+
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ea5e159fdd8363..20e1ba350ed8e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1368,6 +1368,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
+ /// Returns the necessary reduction in number of VGPRs from using \p VGPRs
+ /// VGPRs to increase occupancy by 1. Returns 0 when using \p VGPRs VGPRs
+ /// already results in maximum occupancy.
+ unsigned getNumVGPRsToIncreaseOccupancy(unsigned VGPRs) const;
+
/// Return occupancy for the given function. Used LDS and a number of
/// registers if provided.
/// Note, occupancy can be affected by the scratch allocation as well, but
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ab5f0694c07f95..a996cb21848643 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1185,6 +1185,19 @@ unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
return std::min(std::max(TotalNumVGPRs / RoundedRegs, 1u), MaxWaves);
}
+unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
+ unsigned NumVGPRs) {
+ unsigned Granule = getVGPRAllocGranule(STI);
+ unsigned MaxWaves = getMaxWavesPerEU(STI);
+ unsigned TotalNumVGPRs = getTotalNumVGPRs(STI);
+
+ unsigned NumWaves =
+ getNumWavesPerEUWithNumVGPRs(NumVGPRs, Granule, MaxWaves, TotalNumVGPRs);
+ if (NumWaves == MaxWaves)
+ return 0;
+ return NumVGPRs - alignDown(TotalNumVGPRs / (NumWaves + 1), Granule);
+}
+
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
AMDGPUSubtarget::Generation Gen) {
if (Gen >= AMDGPUSubtarget::GFX10)
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 9f7fbec6a542f7..8b4136da9503cc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -324,6 +324,12 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs);
+/// Returns the necessary reduction in number of VGPRs from using \p VGPRs VGPRs
+/// to increase the achievable number of waves per EU for this subtarget by 1.
+/// Returns 0 when using \p VGPRs VGPRs already results in the maximum number
+/// of waves per EU.
+unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
+ unsigned NumVGPRs);
+
/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index 018da7f81e3d4b..abed0e080b7eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -3163,11 +3163,11 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
- ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -3353,11 +3353,11 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
- ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -5913,4 +5913,141 @@ body: |
S_NOP 0, implicit %22
S_ENDPGM 0
...
+---
+name: test_live_through_occ_7_sink_for_8
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX908-LABEL: name: test_live_through_occ_7_sink_for_8
+ ; GFX908: bb.0:
+ ; GFX908-NEXT: successors: %bb.1(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.1:
+ ; GFX908-NEXT: successors: %bb.2(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: dead [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.2:
+ ; GFX908-NEXT: successors: %bb.3(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_17]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_18]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_19]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_20]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_21]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_22]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_24]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_25]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_26]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_27]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_28]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_29]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_30]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_31]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]]
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.3:
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_]]
+ ; GFX908-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode
+ %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode
+
+ bb.1:
+ successors: %bb.2
+ %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+ %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+ %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+ %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+ %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+ %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+ %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+ %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+ %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+ %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+ %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+ %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+ %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+ %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+ %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+ %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+ %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+ %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+ %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+ %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+
+ bb.2:
+ successors: %bb.3
+
+ S_NOP 0, implicit %2, implicit %18
+ S_NOP 0, implicit %3, implicit %19
+ S_NOP 0, implicit %4, implicit %20
+ S_NOP 0, implicit %5, implicit %21
+ S_NOP 0, implicit %6, implicit %22
+ S_NOP 0, implicit %7, implicit %23
+ S_NOP 0, implicit %8, implicit %24
+ S_NOP 0, implicit %9, implicit %25
+ S_NOP 0, implicit %10, implicit %26
+ S_NOP 0, implicit %11, implicit %27
+ S_NOP 0, implicit %12, implicit %28
+ S_NOP 0, implicit %13, implicit %29
+ S_NOP 0, implicit %14, implicit %30
+ S_NOP 0, implicit %15, implicit %31
+ S_NOP 0, implicit %16, implicit %32
+ S_NOP 0, implicit %33
+
+ bb.3:
+ S_NOP 0, implicit %0, implicit %1
+ S_ENDPGM 0
+...
>From a2e20f4db4d88db6a95198c628f8dcf69551c29d Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Wed, 4 Dec 2024 16:45:16 +0100
Subject: [PATCH 02/11] Edit stale comments and slightly rearch stage
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 35 +++++++++------------
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 26 ++++++++-------
2 files changed, 30 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 1b39a5a7db7192..e43014efa08c5a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -960,9 +960,10 @@ bool PreRARematStage::initGCNSchedStage() {
// if there is another pass after this pass.
assert(!S.hasNextStage());
- if (!collectRematerializableInstructions())
+ std::vector<RematInstruction> RematInstructions;
+ if (!canIncreaseOccupancy(RematInstructions))
return false;
- sinkTriviallyRematInsts(ST, TII);
+ sinkTriviallyRematInsts(RematInstructions, ST, TII);
LLVM_DEBUG(
dbgs() << "Retrying function scheduling with improved occupancy of "
@@ -1258,8 +1259,7 @@ GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
LLVM_DEBUG(
printScheduleModel(ReadyCyclesSorted);
- dbgs() << "\n\t"
- << "Metric: "
+ dbgs() << "\n\t" << "Metric: "
<< (SumBubbles
? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
: 1)
@@ -1294,8 +1294,7 @@ GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
LLVM_DEBUG(
printScheduleModel(ReadyCyclesSorted);
- dbgs() << "\n\t"
- << "Metric: "
+ dbgs() << "\n\t" << "Metric: "
<< (SumBubbles
? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
: 1)
@@ -1343,8 +1342,7 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
dbgs()
<< "\n\t *** In shouldRevertScheduling ***\n"
<< " *********** BEFORE UnclusteredHighRPStage ***********\n");
- ScheduleMetrics MBefore =
- getScheduleMetrics(DAG.SUnits);
+ ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
LLVM_DEBUG(
dbgs()
<< "\n *********** AFTER UnclusteredHighRPStage ***********\n");
@@ -1474,7 +1472,8 @@ void GCNSchedStage::revertScheduling() {
/// Allows to easily filter for this stage's debug output.
#define RA_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
-bool PreRARematStage::collectRematerializableInstructions() {
+bool PreRARematStage::canIncreaseOccupancy(
+ std::vector<RematInstruction> &RematInstructions) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
RA_DEBUG(dbgs() << "Collecting rematerializable instructions\n");
@@ -1511,11 +1510,11 @@ bool PreRARematStage::collectRematerializableInstructions() {
if (NumRegs >= RPExcess) {
OptRegions.erase(I);
LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
- << "\n");
+ << "\n");
} else {
RPExcess -= NumRegs;
LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
- << " by " << NumRegs << " (" << RPExcess << " left)\n");
+ << " by " << NumRegs << " (" << RPExcess << " left)\n");
}
return OptRegions.empty();
};
@@ -1577,8 +1576,7 @@ bool PreRARematStage::collectRematerializableInstructions() {
if (It == DAG.LiveIns[LVRegion].end() || It->second.none())
continue;
Remat.LiveInRegions.insert(LVRegion);
- RA_DEBUG(dbgs() << " Def is live-in in region " << LVRegion
- << ": ");
+ RA_DEBUG(dbgs() << " Def is live-in in region " << LVRegion << ": ");
// Account for the reduction in RP due to the rematerialization in an
// optimizable region in which the defined register is a live-in. This
@@ -1610,8 +1608,9 @@ bool PreRARematStage::collectRematerializableInstructions() {
return false;
}
-void PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
- const TargetInstrInfo *TII) {
+void PreRARematStage::sinkTriviallyRematInsts(
+ ArrayRef<RematInstruction> RematInstructions, const GCNSubtarget &ST,
+ const TargetInstrInfo *TII) {
// Collect regions whose live-ins or register pressure will change due to
// rematerialization, and map those whose maximum RP we need to fully
// recompute at the end to true.
@@ -1620,11 +1619,7 @@ void PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
LiveIntervals *LIS = DAG.LIS;
- // TODO: In the spirit of rematerializing the minimum number of instructions
- // to increase occupancy, here we could sort the list of rematerializable
- // instructions in decreasing order of "expected profitability" so that we end
- // up moving as few instructions as possible in the loop below.
- for (RematInstruction &Remat : RematInstructions) {
+ for (const RematInstruction &Remat : RematInstructions) {
MachineInstr *DefMI = Remat.RematMI;
MachineBasicBlock::iterator InsertPos(Remat.UseMI);
Register Reg = DefMI->getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index c378564d5b2e97..e9ffea09955daf 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -420,6 +420,12 @@ class ClusteredLowOccStage : public GCNSchedStage {
: GCNSchedStage(StageID, DAG) {}
};
+/// Attempts to increase function occupancy with respect to VGPR usage by one by
+/// sinking trivially rematerializable instructions to their use. When the stage
+/// estimates increasing occupancy is possible, as few instructions as possible
+/// are rematerialized to reduce potential negative effects on function latency.
+///
+/// TODO: We should extend this to work on SGPRs and AGPRs as well.
class PreRARematStage : public GCNSchedStage {
private:
/// A trivially rematerializable VGPR-defining instruction along with
@@ -442,22 +448,20 @@ class PreRARematStage : public GCNSchedStage {
: RematMI(RematMI), DefRegion(DefRegion), UseMI(UseMI) {}
};
- /// List of eligible rematerializable instructions to sink to increase
- /// occupancy, in function instruction order.
- std::vector<RematInstruction> RematInstructions;
+ /// Determines whether we can increase function occupancy by 1 through
+ /// rematerialization. If we can, returns true and fills \p RematInstructions
+ /// with a list of rematerializable instructions whose sinking would result in
+ /// increased occupancy; returns false otherwise.
+ bool canIncreaseOccupancy(std::vector<RematInstruction> &RematInstructions);
- /// Collect all trivially rematerializable VGPR instructions with a single def
- /// and single use outside the defining block into RematerializableInsts.
- bool collectRematerializableInstructions();
-
- /// Whether the MI is trivially rematerializable and does not have eany
+ /// Whether the MI is trivially rematerializable and does not have any
/// virtual register use.
bool isTriviallyReMaterializable(const MachineInstr &MI);
- /// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
- /// Sinks all instructions in RematInstructions to increase function
+ /// Sinks all instructions in \p RematInstructions to increase function
/// occupancy. Modified regions are tagged for rescheduling.
- void sinkTriviallyRematInsts(const GCNSubtarget &ST,
+ void sinkTriviallyRematInsts(ArrayRef<RematInstruction> RematInstructions,
+ const GCNSubtarget &ST,
const TargetInstrInfo *TII);
public:
>From 0d9103463ae71faf19645dc0efb0337c0e972426 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 5 Dec 2024 01:15:48 +0100
Subject: [PATCH 03/11] Wrap comment correctly
---
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 8b4136da9503cc..783827f994c0e0 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -326,7 +326,8 @@ unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
/// Returns the necessary reduction in number of VGPRs from using \p VGPRs VGPRs
/// to increase the achievable number of waves per EU for this subtarget by 1.
-/// Returns 0 when using \p VGPRs VGPRs already results in maximum number of waves per EU.
+/// Returns 0 when using \p VGPRs VGPRs already results in maximum number of
+/// waves per EU.
unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
unsigned NumVGPRs);
>From c095a15b92aa9318489b78ab7fa0cffaa544aaea Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 5 Dec 2024 01:27:12 +0100
Subject: [PATCH 04/11] Fix format
---
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 20e1ba350ed8e7..99e163a5661fe1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1372,7 +1372,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// VGPRs to increase occupancy by 1. Returns 0 when using \p VGPRs VGPRs
/// already results in maximum occupancy.
unsigned getNumVGPRsToIncreaseOccupancy(unsigned VGPRs) const;
-
+
/// Return occupancy for the given function. Used LDS and a number of
/// registers if provided.
/// Note, occupancy can be affected by the scratch allocation as well, but
>From e81f2adda81110da67a4b65e31d9ca21d9b515e3 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 6 Dec 2024 14:18:23 +0100
Subject: [PATCH 05/11] Address review comments
---
.../llvm/CodeGen/MachineRegisterInfo.h | 4 +
llvm/lib/CodeGen/MachineRegisterInfo.cpp | 5 +
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 88 ++++-----
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 16 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 2 +-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 +-
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 5 +-
.../machine-scheduler-sink-trivial-remats.mir | 183 ++++++++++++++++--
8 files changed, 236 insertions(+), 71 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 5dc51aaed81c7b..9bdae6aef134f0 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -23,6 +23,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/RegisterBank.h"
@@ -592,6 +593,9 @@ class MachineRegisterInfo {
/// multiple uses.
bool hasOneNonDBGUser(Register RegNo) const;
+ /// If the specified register has exactly one non-debug instruction using it,
+ /// returns that instruction; otherwise returns nullptr.
+ MachineInstr *getOneNonDBGUser(Register RegNo) const;
/// hasAtMostUses - Return true if the given register has at most \p MaxUsers
/// non-debug user instructions.
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index fcedb302d228c4..5739357d31c2c1 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -431,6 +431,11 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
return hasSingleElement(use_nodbg_instructions(RegNo));
}
+MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
+ auto RegNoDbgUsers = use_nodbg_instructions(RegNo);
+ return hasSingleElement(RegNoDbgUsers) ? &*RegNoDbgUsers.begin() : nullptr;
+}
+
bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
unsigned MaxUsers) const {
return hasNItemsOrLess(use_instr_nodbg_begin(Reg), use_instr_nodbg_end(),
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e43014efa08c5a..42d50e86345a70 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -960,10 +960,11 @@ bool PreRARematStage::initGCNSchedStage() {
// if there is another pass after this pass.
assert(!S.hasNextStage());
- std::vector<RematInstruction> RematInstructions;
+ SmallVector<RematInstruction> RematInstructions;
if (!canIncreaseOccupancy(RematInstructions))
return false;
- sinkTriviallyRematInsts(RematInstructions, ST, TII);
+ sinkTriviallyRematInsts(RematInstructions, ST,
+ static_cast<const SIInstrInfo *>(TII));
LLVM_DEBUG(
dbgs() << "Retrying function scheduling with improved occupancy of "
@@ -1473,7 +1474,7 @@ void GCNSchedStage::revertScheduling() {
#define RA_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
bool PreRARematStage::canIncreaseOccupancy(
- std::vector<RematInstruction> &RematInstructions) {
+ SmallVectorImpl<RematInstruction> &RematInstructions) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
RA_DEBUG(dbgs() << "Collecting rematerializable instructions\n");
@@ -1510,7 +1511,7 @@ bool PreRARematStage::canIncreaseOccupancy(
if (NumRegs >= RPExcess) {
OptRegions.erase(I);
LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
- << "\n");
+ << '\n');
} else {
RPExcess -= NumRegs;
LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
@@ -1526,34 +1527,27 @@ bool PreRARematStage::canIncreaseOccupancy(
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
auto Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI) {
- // We only support instructions with at least one register (but
- // non-subregister) operand.
+ // The instruction must be trivially rematerializable.
MachineInstr &DefMI = *MI;
- if (DefMI.isBundle() || !DefMI.getNumOperands() ||
- !DefMI.getOperand(0).isReg() || DefMI.getOperand(0).getSubReg())
+ if (!isTriviallyReMaterializable(DefMI))
continue;
- // We only support rematerializing virtual VGPRs with one definition and
- // one use.
+ // We only support rematerializing virtual VGPRs with one definition.
Register Reg = DefMI.getOperand(0).getReg();
if (!Reg.isVirtual() || !DAG.LIS->hasInterval(Reg) ||
!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
- !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
- continue;
-
- // The instruction must be trivially rematerializable and have no virtual
- // register use.
- if (!isTriviallyReMaterializable(DefMI))
+ !DAG.MRI.hasOneDef(Reg))
continue;
- // We only care to rematerialize the instruction if its single use is in a
- // different block.
- MachineInstr *UseMI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
- if (DefMI.getParent() == UseMI->getParent())
+ // We only care to rematerialize the instruction if it has a single non-debug
+ // user in a different block.
+ MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
+ if (!UseMI || DefMI.getParent() == UseMI->getParent())
continue;
- RA_DEBUG(dbgs() << "In region " << I << ", instruction " << DefMI
- << " is rematerializable with single use " << *UseMI);
+ RA_DEBUG(dbgs() << "In region " << I << ", rematerializable instruction "
+ << DefMI);
+ RA_DEBUG(dbgs() << "with single use " << *UseMI);
auto &Remat = RematInstructions.emplace_back(&DefMI, I, UseMI);
bool RematUseful = false;
@@ -1564,7 +1558,8 @@ bool PreRARematStage::canIncreaseOccupancy(
// instruction and the end of the region.
RA_DEBUG(dbgs() << " Instruction's defining region is optimizable: ");
RematUseful = true;
- auto RegMask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
+ LaneBitmask RegMask =
+ DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
if (ReduceRPInRegion(It, I, RegMask))
return true;
}
@@ -1610,15 +1605,15 @@ bool PreRARematStage::canIncreaseOccupancy(
void PreRARematStage::sinkTriviallyRematInsts(
ArrayRef<RematInstruction> RematInstructions, const GCNSubtarget &ST,
- const TargetInstrInfo *TII) {
+ const SIInstrInfo *TII) {
// Collect regions whose live-ins or register pressure will change due to
// rematerialization, and map those whose maximum RP we need to fully
// recompute at the end to true.
SmallDenseMap<unsigned, bool> ImpactedRegions;
// Maps rematerialized instuctions to the one they were rematerialized from.
DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
- LiveIntervals *LIS = DAG.LIS;
+ // Rematerialize all instructions.
for (const RematInstruction &Remat : RematInstructions) {
MachineInstr *DefMI = Remat.RematMI;
MachineBasicBlock::iterator InsertPos(Remat.UseMI);
@@ -1631,9 +1626,8 @@ void PreRARematStage::sinkTriviallyRematInsts(
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
DefMI->getOperand(0).getSubReg(), *DefMI, *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
- LIS->InsertMachineInstrInMaps(*NewMI);
- LIS->removeInterval(Reg);
- LIS->createAndComputeVirtRegInterval(Reg);
+ NewMI->getOperand(0).setSubReg(DefMI->getOperand(0).getSubReg());
+ DAG.LIS->InsertMachineInstrInMaps(*NewMI);
InsertedMIToOldDef[NewMI] = DefMI;
// Update region boundaries in scheduling region we sinked from since we
@@ -1654,7 +1648,8 @@ void PreRARematStage::sinkTriviallyRematInsts(
// contains the single use. In live-through regions, maximum register
// pressure decreases predictably so we can directly update it. In the
// using region, maximum register pressure may or may not decrease, so we
- // will mark it for re-computation after all materializations.
+ // will mark it for re-computation after all materializations have taken
+ // place.
RegionLiveIns.erase(Reg);
if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent()) {
// Register is live-through and not used in this block.
@@ -1671,6 +1666,20 @@ void PreRARematStage::sinkTriviallyRematInsts(
ImpactedRegions[Remat.DefRegion] = true;
}
+ // Clean up the IR; remove rematerialized instructions from state.
+ for (auto &[NewMI, OldMI] : InsertedMIToOldDef) {
+ // Remove rematerialized instruction from BBLiveInMap since we are sinking
+ // it from its MBB.
+ DAG.BBLiveInMap.erase(OldMI);
+
+ // Remove the rematerialized instruction and update live intervals.
+ Register Reg = NewMI->getOperand(0).getReg();
+ DAG.LIS->RemoveMachineInstrFromMaps(*OldMI);
+ OldMI->eraseFromParent();
+ DAG.LIS->removeInterval(Reg);
+ DAG.LIS->createAndComputeVirtRegInterval(Reg);
+ }
+
// All regions impacted by at least one rematerialization must be rescheduled.
BitVector NewRescheduleRegions(DAG.Regions.size());
for (auto &[I, RecomputeRP] : ImpactedRegions) {
@@ -1680,7 +1689,7 @@ void PreRARematStage::sinkTriviallyRematInsts(
// Recompute maximum RP in regions in which at least one instruction was
// rematerialized from or to.
if (RecomputeRP) {
- GCNDownwardRPTracker RPT(*LIS);
+ GCNDownwardRPTracker RPT(*DAG.LIS);
auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
DAG.Regions[I].second);
RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
@@ -1690,20 +1699,6 @@ void PreRARematStage::sinkTriviallyRematInsts(
}
DAG.RescheduleRegions = NewRescheduleRegions;
- // Clean up the IR; remove rematerialized instructions from state.
- for (auto &[NewMI, OldMI] : InsertedMIToOldDef) {
- // Remove rematerialized instruction from BBLiveInMap since we are sinking
- // it from its MBB.
- DAG.BBLiveInMap.erase(OldMI);
-
- // Remove the rematerialized instruction and update LIS.
- Register Reg = NewMI->getOperand(0).getReg();
- LIS->RemoveMachineInstrFromMaps(*OldMI);
- OldMI->eraseFromParent();
- LIS->removeInterval(Reg);
- LIS->createAndComputeVirtRegInterval(Reg);
- }
-
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
@@ -1716,6 +1711,11 @@ bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
if (!DAG.TII->isTriviallyReMaterializable(MI))
return false;
+ // Even though TargetInstrInfo::isReallyTriviallyReMaterializable already
+ // ensures that the instruction has no virtual register uses,
+ // SIInstrInfo::isReallyTriviallyReMaterializable may consider an instruction
+ // rematerializable and return before calling its parent's method, so we need
+ // to double-check here.
for (const MachineOperand &MO : MI.all_uses())
if (MO.getReg().isVirtual())
return false;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index e9ffea09955daf..9ed88f2b39cf0b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -434,35 +434,35 @@ class PreRARematStage : public GCNSchedStage {
struct RematInstruction {
/// Trivially rematerializable instruction.
MachineInstr *RematMI;
- /// Region containing the rematerializable instruction.
- unsigned DefRegion;
/// Single use of the rematerializable instruction's defined register,
/// located in a different block.
MachineInstr *UseMI;
/// Set of regions in which the rematerializable instruction's defined
/// register is a live-in.
SmallDenseSet<unsigned, 4> LiveInRegions;
+ /// Region containing the rematerializable instruction.
+ unsigned DefRegion;
RematInstruction(MachineInstr *RematMI, unsigned DefRegion,
MachineInstr *UseMI)
- : RematMI(RematMI), DefRegion(DefRegion), UseMI(UseMI) {}
+ : RematMI(RematMI), UseMI(UseMI), DefRegion(DefRegion) {}
};
/// Determines whether we can increase function occupancy by 1 through
/// rematerialization. If we can, returns true and fill \p RematInstructions
/// with a list of rematerializable instructions whose sinking would result in
/// increased occupancy; returns false otherwise.
- bool canIncreaseOccupancy(std::vector<RematInstruction> &RematInstructions);
+ bool
+ canIncreaseOccupancy(SmallVectorImpl<RematInstruction> &RematInstructions);
- /// Whether the MI is trivially rematerializable and does not have any
- /// virtual register use.
+ /// Whether the MI is trivially rematerializable and does not have any virtual
+ /// register use.
bool isTriviallyReMaterializable(const MachineInstr &MI);
/// Sinks all instructions in \p RematInstructions to increase function
/// occupancy. Modified regions are tagged for rescheduling.
void sinkTriviallyRematInsts(ArrayRef<RematInstruction> RematInstructions,
- const GCNSubtarget &ST,
- const TargetInstrInfo *TII);
+ const GCNSubtarget &ST, const SIInstrInfo *TII);
public:
bool initGCNSchedStage() override;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 78f15ed8be8cae..06ad445f865ff8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -368,7 +368,7 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
}
unsigned GCNSubtarget::getNumVGPRsToIncreaseOccupancy(unsigned NumVGPRs) const {
- return AMDGPU::IsaInfo::getNumVGPRsToIncreaseWavesPerEU(this, NumVGPRs);
+ return AMDGPU::IsaInfo::getVGPRReductionToIncreaseWavesPerEU(this, NumVGPRs);
}
unsigned
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index a996cb21848643..67332b25a45e08 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1185,8 +1185,8 @@ unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
return std::min(std::max(TotalNumVGPRs / RoundedRegs, 1u), MaxWaves);
}
-unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
- unsigned NumVGPRs) {
+unsigned getVGPRReductionToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
+ unsigned NumVGPRs) {
unsigned Granule = getVGPRAllocGranule(STI);
unsigned MaxWaves = getMaxWavesPerEU(STI);
unsigned TotalNumVGPRs = getTotalNumVGPRs(STI);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 783827f994c0e0..375a02f242e33b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -328,8 +328,9 @@ unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
/// to increase the achievable number of waves per EU for this subtarget by 1.
/// Returns 0 when using \p VGPRs VGPRs already results in maximum number of
/// waves per EU.
-unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
- unsigned NumVGPRs);
+
+unsigned getVGPRReductionToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
+ unsigned NumVGPRs);
/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index abed0e080b7eeb..0a9978dea0461a 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -5585,12 +5585,12 @@ body: |
     S_ENDPGM 0
 ...
 ---
-name: test_occ_9_no_sink_one_def_of_undef_subreg
+name: test_occ_7_sink_one_def_of_undef_subreg_for_8
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
- ; GFX908-LABEL: name: test_occ_9_no_sink_one_def_of_undef_subreg
+ ; GFX908-LABEL: name: test_occ_7_sink_one_def_of_undef_subreg_for_8
; GFX908: bb.0:
; GFX908-NEXT: successors: %bb.1(0x80000000)
; GFX908-NEXT: {{ $}}
@@ -5617,16 +5618,22 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
- ; GFX908-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]]
@@ -5638,7 +5645,13 @@ body: |
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_25]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]]
+ ; GFX908-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
; GFX908-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1
@@ -5666,19 +5679,24 @@ body: |
%20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
%21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
%22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- undef %23.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
+ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ undef %32.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
- %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- S_NOP 0, implicit %24
+ S_NOP 0, implicit %0
bb.2:
- ; predcessors: %bb.1
- S_NOP 0, implicit %23.sub1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
S_NOP 0, implicit %4, implicit %5
@@ -5690,7 +5708,12 @@ body: |
S_NOP 0, implicit %16, implicit %17
S_NOP 0, implicit %18, implicit %19
S_NOP 0, implicit %20, implicit %21
- S_NOP 0, implicit %22
+ S_NOP 0, implicit %22, implicit %23
+ S_NOP 0, implicit %24, implicit %25
+ S_NOP 0, implicit %26, implicit %27
+ S_NOP 0, implicit %28, implicit %29
+ S_NOP 0, implicit %30, implicit %31
+ S_NOP 0, implicit %32.sub1
S_ENDPGM 0
...
---
@@ -6051,3 +6074,135 @@ body: |
S_NOP 0, implicit %0, implicit %1
S_ENDPGM 0
...
+---
+name: test_remat_over_exec_modif
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX908-LABEL: name: test_remat_over_exec_modif
+ ; GFX908: bb.0:
+ ; GFX908-NEXT: successors: %bb.1(0x80000000)
+ ; GFX908-NEXT: liveins: $sgpr2_sgpr3
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: %new_exec:sgpr_64 = COPY $sgpr2_sgpr3
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: %save_exec:sreg_64 = S_MOV_B64 $exec
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: $exec = S_MOV_B64 %new_exec
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.1:
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_16]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_17]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_18]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_19]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_20]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_21]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_22]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_24]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_25]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_26]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_27]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_28]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_29]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_30]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_31]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]]
+ ; GFX908-NEXT: $exec = S_MOV_B64 %save_exec
+ ; GFX908-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $sgpr2_sgpr3
+
+ %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+ %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+ %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+ %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+ %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+ %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+ %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+ %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+ %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+ %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+ %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+ %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+ %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+ %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+ %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+ %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+ %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+ %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+ %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+ %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+ %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+
+ %save_exec:sreg_64 = S_MOV_B64 $exec
+ %new_exec:sgpr_64 = COPY $sgpr2_sgpr3
+ $exec = S_MOV_B64 %new_exec
+
+ bb.1:
+
+ S_NOP 0, implicit %0, implicit %16
+ S_NOP 0, implicit %1, implicit %17
+ S_NOP 0, implicit %2, implicit %18
+ S_NOP 0, implicit %3, implicit %19
+ S_NOP 0, implicit %4, implicit %20
+ S_NOP 0, implicit %5, implicit %21
+ S_NOP 0, implicit %6, implicit %22
+ S_NOP 0, implicit %7, implicit %23
+ S_NOP 0, implicit %8, implicit %24
+ S_NOP 0, implicit %9, implicit %25
+ S_NOP 0, implicit %10, implicit %26
+ S_NOP 0, implicit %11, implicit %27
+ S_NOP 0, implicit %12, implicit %28
+ S_NOP 0, implicit %13, implicit %29
+ S_NOP 0, implicit %14, implicit %30
+ S_NOP 0, implicit %15, implicit %31
+ S_NOP 0, implicit %32
+
+ $exec = S_MOV_B64 %save_exec
+ S_ENDPGM 0
+...
>From 1ceec9d26ff33bd2139d53aab433405ca23c8a56 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 6 Dec 2024 14:22:52 +0100
Subject: [PATCH 06/11] Remove useless closure argument
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 42d50e86345a70..57497ba002c9d4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -306,11 +306,11 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
HasHighPressure = true;
if (SGPRDelta > VGPRDelta) {
Cand.RPDelta.CriticalMax =
- PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
+ PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
} else {
Cand.RPDelta.CriticalMax =
- PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
+ PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
}
}
@@ -419,7 +419,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
/*IsBottomUp=*/false);
assert(TCand.SU == TopCand.SU &&
- "Last pick result should correspond to re-picking right now");
+ "Last pick result should correspond to re-picking right now");
}
#endif
}
@@ -1502,11 +1502,12 @@ bool PreRARematStage::canIncreaseOccupancy(
if (OptRegions.empty())
return false;
- // Tracks estimated rematerialization gains (i.e., reduction in RP) for
- // this instruction in each optimizable region.
- auto ReduceRPInRegion = [&](auto OptIt, unsigned I,
- LaneBitmask Mask) -> bool {
+ // Accounts for a reduction in RP in an optimizable region. Returns whether we
+ // estimate that we have identified enough rematerialization opportunities to
+ // increase function occupancy.
+ auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask) -> bool {
auto NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
+ unsigned I = OptIt->getFirst();
unsigned &RPExcess = OptIt->getSecond();
if (NumRegs >= RPExcess) {
OptRegions.erase(I);
@@ -1560,7 +1561,7 @@ bool PreRARematStage::canIncreaseOccupancy(
RematUseful = true;
LaneBitmask RegMask =
DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
- if (ReduceRPInRegion(It, I, RegMask))
+ if (ReduceRPInRegion(It, RegMask))
return true;
}
@@ -1581,7 +1582,7 @@ bool PreRARematStage::canIncreaseOccupancy(
// instruction's use.
if (auto It = OptRegions.find(LVRegion); It != OptRegions.end()) {
RematUseful = true;
- if (ReduceRPInRegion(It, LVRegion, DAG.LiveIns[LVRegion][Reg]))
+ if (ReduceRPInRegion(It, DAG.LiveIns[LVRegion][Reg]))
return true;
} else {
LLVM_DEBUG(dbgs() << "unoptimizable region\n");
>From ce4e1ad9e0412167fe31892bf867000785b76556 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 6 Dec 2024 14:33:53 +0100
Subject: [PATCH 07/11] Fix format
---
llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 4 ++--
llvm/lib/CodeGen/MachineRegisterInfo.cpp | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 9bdae6aef134f0..23e4b30696e06d 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -594,8 +594,8 @@ class MachineRegisterInfo {
bool hasOneNonDBGUser(Register RegNo) const;
/// If the register has a single non-Debug instruction using the specified
- /// register, returns it; otherwise returns nullptr.
- MachineInstr* getOneNonDBGUser(Register RegNo) const;
+ /// register, returns it; otherwise returns nullptr.
+ MachineInstr *getOneNonDBGUser(Register RegNo) const;
/// hasAtMostUses - Return true if the given register has at most \p MaxUsers
/// non-debug user instructions.
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 5739357d31c2c1..5c29e902849aa5 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -431,7 +431,7 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
return hasSingleElement(use_nodbg_instructions(RegNo));
}
-MachineInstr* MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
+MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
auto RegNoDbgUsers = use_nodbg_instructions(RegNo);
return hasSingleElement(RegNoDbgUsers) ? &*RegNoDbgUsers.begin() : nullptr;
}
>From b4c8af98b71b4bd9c9d212aaffc13cf4f4e42127 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Mon, 9 Dec 2024 16:12:01 +0100
Subject: [PATCH 08/11] Update live interval manually instead of fully
recomputing
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 40 ++++++++++++++++-----
1 file changed, 32 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 57497ba002c9d4..2d193b3a2abf64 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1611,8 +1611,8 @@ void PreRARematStage::sinkTriviallyRematInsts(
// rematerialization, and map those whose maximum RP we need to fully
// recompute at the end to true.
SmallDenseMap<unsigned, bool> ImpactedRegions;
- // Maps rematerialized instuctions to the one they were rematerialized from.
- DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
+ // Collect new MIs, in rematerialization order.
+ SmallVector<MachineInstr *> NewMIs;
// Rematerialize all instructions.
for (const RematInstruction &Remat : RematInstructions) {
@@ -1629,7 +1629,7 @@ void PreRARematStage::sinkTriviallyRematInsts(
MachineInstr *NewMI = &*std::prev(InsertPos);
NewMI->getOperand(0).setSubReg(DefMI->getOperand(0).getSubReg());
DAG.LIS->InsertMachineInstrInMaps(*NewMI);
- InsertedMIToOldDef[NewMI] = DefMI;
+ NewMIs.push_back(NewMI);
// Update region boundaries in scheduling region we sinked from since we
// may sink an instruction that was at the beginning or end of its region.
@@ -1668,15 +1668,39 @@ void PreRARematStage::sinkTriviallyRematInsts(
}
// Clean up the IR; remove rematerialized instructions from state.
- for (auto &[NewMI, OldMI] : InsertedMIToOldDef) {
+ for (auto [Remat, NewMI] : llvm::zip_equal(RematInstructions, NewMIs)) {
// Remove rematerialized instruction from BBLiveInMap since we are sinking
- // it from its MBB.
+ // it from its MBB. Also remove it from live intervals and from its MBB.
+ MachineInstr *OldMI = Remat.RematMI;
DAG.BBLiveInMap.erase(OldMI);
-
- // Remove the rematerialized instruction and update live intervals.
- Register Reg = NewMI->getOperand(0).getReg();
DAG.LIS->RemoveMachineInstrFromMaps(*OldMI);
OldMI->eraseFromParent();
+
+ // Update the register's live interval manually. This is aligned with the
+ // instruction collection phase in that it assumes a single def and use
+ // (possibly subreg).
+ SlotIndexes *Slots = DAG.LIS->getSlotIndexes();
+ SlotIndex NewDef = Slots->getInstructionIndex(*NewMI).getRegSlot();
+ SlotIndex NewKill = Slots->getInstructionIndex(*Remat.UseMI).getRegSlot();
+
+ auto UpdateSegment = [&](LiveRange::Segments &Segments) -> void {
+ assert(Segments.size() == 1 && "expected single segment");
+ LiveRange::Segment &Seg = Segments.front();
+ Seg.start = NewDef;
+ Seg.end = NewKill;
+ if (Seg.valno)
+ Seg.valno->def = NewDef;
+ };
+
+ Register Reg = NewMI->getOperand(0).getReg();
+ LiveInterval &RegLI = DAG.LIS->getInterval(Reg);
+ UpdateSegment(RegLI.segments);
+ if (RegLI.hasSubRanges()) {
+ LiveInterval::SubRange &SubRange = *RegLI.subrange_begin();
+ assert(!SubRange.Next && "expected at most one subrange");
+ UpdateSegment(SubRange.segments);
+ }
+
DAG.LIS->removeInterval(Reg);
DAG.LIS->createAndComputeVirtRegInterval(Reg);
}
>From 928699e7b9ca1a457bba10d7e7fc5eba96e9a865 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Mon, 9 Dec 2024 16:15:35 +0100
Subject: [PATCH 09/11] Remove spurious live interval recomputing
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 3 ---
1 file changed, 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 2d193b3a2abf64..070fbba5aed793 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1700,9 +1700,6 @@ void PreRARematStage::sinkTriviallyRematInsts(
assert(!SubRange.Next && "expected at most one subrange");
UpdateSegment(SubRange.segments);
}
-
- DAG.LIS->removeInterval(Reg);
- DAG.LIS->createAndComputeVirtRegInterval(Reg);
}
// All regions impacted by at least one rematerialization must be rescheduled.
>From f47638e427e9eeddefa314f6bb9a0bd637d3c8b9 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Wed, 11 Dec 2024 22:48:53 +0100
Subject: [PATCH 10/11] Early return on SGPR-limited region
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 070fbba5aed793..7496ffa62e1416 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1490,8 +1490,11 @@ bool PreRARematStage::canIncreaseOccupancy(
// We do not rematerialize SGPR-defining regions yet so do not bother
// optimizing regions whose occupancy is SGPR-limited.
- if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy)
- continue;
+ if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy) {
+ RA_DEBUG(dbgs() << "Region " << I
+ << "'s occupancy is SGPR-limited, aborting\n");
+ return false;
+ }
unsigned NumVGPRs = RP.getVGPRNum(ST.hasGFX90AInsts());
unsigned NumToIncreaseOcc = ST.getNumVGPRsToIncreaseOccupancy(NumVGPRs);
>From c0b4f825019ad4c4c589d51d88bb5c4616510a41 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Tue, 7 Jan 2025 12:19:14 +0100
Subject: [PATCH 11/11] Incorporated feedback and fixed pre-existing bugs
- The stage now tries to remove any VGPR spilling before trying to
increase occupancy. In all cases it remats just enough instructions to
reduce/eliminate spilling and/or increase occupancy.
- Stop updating MBBLiveIns and BBLiveInMap in the stage, since these are
not accessed.
- Fixed region boundary update logic; it did not work when a region
  became empty due to rematerializing all of its MIs.
- Fixed issue in getRegionLiveInMap wherein the BB pointer was not updated
  correctly.
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 268 +++++---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 22 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 4 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 9 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 7 +-
.../machine-scheduler-sink-trivial-remats.mir | 599 ++++++++++++++++++
7 files changed, 811 insertions(+), 103 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 7496ffa62e1416..358d63d1872c79 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -754,13 +754,13 @@ GCNScheduleDAGMILive::getRegionLiveInMap() const {
std::vector<MachineInstr *> RegionFirstMIs;
RegionFirstMIs.reserve(Regions.size());
auto I = Regions.rbegin(), E = Regions.rend();
- auto *BB = I->first->getParent();
do {
+ const MachineBasicBlock *MBB = I->first->getParent();
auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
RegionFirstMIs.push_back(MI);
do {
++I;
- } while (I != E && I->first->getParent() == BB);
+ } while (I != E && I->first->getParent() == MBB);
} while (I != E);
return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
}
@@ -955,13 +955,13 @@ bool PreRARematStage::initGCNSchedStage() {
DAG.MinOccupancy)
return false;
- // FIXME: This pass will invalidate cached MBBLiveIns for regions
- // inbetween the defs and region we sinked the def to. Will need to be fixed
- // if there is another pass after this pass.
+ // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
+  // regions in between the defs and region we sinked the def to. Will need to
+ // fixed if there is another pass after this pass.
assert(!S.hasNextStage());
SmallVector<RematInstruction> RematInstructions;
- if (!canIncreaseOccupancy(RematInstructions))
+ if (!canIncreaseOccupancyOrReduceSpill(RematInstructions))
return false;
sinkTriviallyRematInsts(RematInstructions, ST,
static_cast<const SIInstrInfo *>(TII));
@@ -1473,53 +1473,105 @@ void GCNSchedStage::revertScheduling() {
/// Allows to easily filter for this stage's debug output.
#define RA_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
-bool PreRARematStage::canIncreaseOccupancy(
+bool PreRARematStage::canIncreaseOccupancyOrReduceSpill(
SmallVectorImpl<RematInstruction> &RematInstructions) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
- RA_DEBUG(dbgs() << "Collecting rematerializable instructions\n");
+ RA_DEBUG(dbgs() << "Collecting rematerializable instructions in "
+ << MF.getFunction().getName() << "\n");
+ // Models excess register pressure in a region.
+ struct ExcessRP {
+ // Amount of VGPR spilling, if any.
+ unsigned Spilling = 0;
+ // Number of VGPRs to save to increase occupancy. std::nullopt when
+ // occupancy is SGPR-limited.
+ std::optional<unsigned> Occupancy = std::nullopt;
+ };
// Maps optimizable regions (i.e., regions at minimum and VGPR-limited
- // occupancy) to the numbers of VGPRs that must be deducted from their maximum
- // VGPR pressure for their occupancy to be increased by one.
- DenseMap<unsigned, unsigned> OptRegions;
+ // occupancy, or regions with VGPR spilling) to their excess RP.
+ DenseMap<unsigned, ExcessRP> OptRegions;
+
+ // Collect optimizable regions.
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+ unsigned NumRegionsSpilling = 0;
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
if (!DAG.RegionsWithMinOcc[I])
continue;
GCNRegPressure &RP = DAG.Pressure[I];
- // We do not rematerialize SGPR-defining regions yet so do not bother
- // optimizing regions whose occupancy is SGPR-limited.
- if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy) {
- RA_DEBUG(dbgs() << "Region " << I
- << "'s occupancy is SGPR-limited, aborting\n");
- return false;
- }
-
unsigned NumVGPRs = RP.getVGPRNum(ST.hasGFX90AInsts());
- unsigned NumToIncreaseOcc = ST.getNumVGPRsToIncreaseOccupancy(NumVGPRs);
- OptRegions.insert({I, NumToIncreaseOcc});
- RA_DEBUG(dbgs() << "Region " << I << " has min. occupancy: decrease by "
- << NumToIncreaseOcc << " VGPR(s) to improve occupancy\n");
+ ExcessRP Excess;
+ if (NumVGPRs > MaxVGPRs) {
+ // Region has VGPR spilling, we may not be able to increase occupancy but
+ // we can at least try to reduce spilling as much as possible.
+ ++NumRegionsSpilling;
+ Excess.Spilling = ST.getNumVGPRsToEliminateSpilling(NumVGPRs);
+ RA_DEBUG(dbgs() << "Region " << I << " is spilling VGPRs, save "
+ << Excess.Spilling << " VGPR(s) to eliminate spilling\n");
+ }
+ if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) != DAG.MinOccupancy) {
+ // Occupancy is VGPR-limited. If occupancy is minimal and there is
+ // spilling, the number of registers to save to increase occupancy
+ // includes the number of spilled registers to save; deduct the latter
+ // from the former to only get the number of registers to save to
+ // increase occupancy as if there was no spilling.
+ Excess.Occupancy =
+ ST.getNumVGPRsToIncreaseOccupancy(NumVGPRs) - Excess.Spilling;
+ RA_DEBUG(dbgs() << "Region " << I << " has min. occupancy: save "
+ << Excess.Occupancy << " VGPR(s) to improve occupancy\n");
+ }
+ if (Excess.Spilling || Excess.Occupancy)
+ OptRegions.insert({I, Excess});
}
if (OptRegions.empty())
return false;
+ bool FuncHasSpilling = NumRegionsSpilling != 0;
+ unsigned RematSpillingCutoff = 0;
+
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
// increase function occupancy.
auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask) -> bool {
auto NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
unsigned I = OptIt->getFirst();
- unsigned &RPExcess = OptIt->getSecond();
- if (NumRegs >= RPExcess) {
- OptRegions.erase(I);
- LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
- << '\n');
- } else {
- RPExcess -= NumRegs;
- LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
- << " by " << NumRegs << " (" << RPExcess << " left)\n");
+ ExcessRP &Excess = OptIt->getSecond();
+
+ // While there is spilling, saved registers only serve to reduce it.
+ if (Excess.Spilling) {
+ unsigned Reduction = std::min(NumRegs, Excess.Spilling);
+ Excess.Spilling -= Reduction;
+ NumRegs -= Reduction;
+ if (!Excess.Spilling) {
+ // We have eliminated spilling in the region.
+ LLVM_DEBUG(dbgs() << "sinking eliminates spilling in region " << I
+ << '\n');
+ if (--NumRegionsSpilling)
+ RematSpillingCutoff = RematInstructions.size();
+ if (!Excess.Occupancy) {
+ // Occupancy cannot be increased, so we are done with this region.
+ OptRegions.erase(I);
+ return OptRegions.empty();
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << "sinking reduces spilling in region " << I
+ << " by " << Reduction << '\n');
+ }
+ }
+
+ // Once spilling has been eliminated, saved registers serve to increase
+ // occupancy.
+ if (NumRegs && Excess.Occupancy) {
+ if (NumRegs >= *Excess.Occupancy) {
+ OptRegions.erase(I);
+ LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
+ << '\n');
+ } else {
+ *Excess.Occupancy -= NumRegs;
+ LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
+ << " by " << NumRegs << '\n');
+ }
}
return OptRegions.empty();
};
@@ -1543,8 +1595,8 @@ bool PreRARematStage::canIncreaseOccupancy(
!DAG.MRI.hasOneDef(Reg))
continue;
- // We only care to rematerialize the instruction if has a single non-debug
- // user in a different block.
+ // We only care to rematerialize the instruction if it has a single
+ // non-debug user in a different block.
MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
if (!UseMI || DefMI.getParent() == UseMI->getParent())
continue;
@@ -1568,14 +1620,14 @@ bool PreRARematStage::canIncreaseOccupancy(
return true;
}
- for (unsigned LVRegion = 0; LVRegion != E; ++LVRegion) {
+ for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
// We are only collecting regions in which the register is a live-in
// (and may be live-through).
- auto It = DAG.LiveIns[LVRegion].find(Reg);
- if (It == DAG.LiveIns[LVRegion].end() || It->second.none())
+ auto It = DAG.LiveIns[LIRegion].find(Reg);
+ if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
continue;
- Remat.LiveInRegions.insert(LVRegion);
- RA_DEBUG(dbgs() << " Def is live-in in region " << LVRegion << ": ");
+ Remat.LiveInRegions.insert(LIRegion);
+ RA_DEBUG(dbgs() << " Def is live-in in region " << LIRegion << ": ");
// Account for the reduction in RP due to the rematerialization in an
// optimizable region in which the defined register is a live-in. This
@@ -1583,9 +1635,9 @@ bool PreRARematStage::canIncreaseOccupancy(
// where RP is actually reduced only if maximum RP is reached somewhere
// between the beginning of the region and the rematerializable
// instruction's use.
- if (auto It = OptRegions.find(LVRegion); It != OptRegions.end()) {
+ if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
RematUseful = true;
- if (ReduceRPInRegion(It, DAG.LiveIns[LVRegion][Reg]))
+ if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg]))
return true;
} else {
LLVM_DEBUG(dbgs() << "unoptimizable region\n");
@@ -1603,6 +1655,17 @@ bool PreRARematStage::canIncreaseOccupancy(
}
}
+ if (FuncHasSpilling && !RematInstructions.empty()) {
+ // We won't be able to increase occupancy, but we still want to try to
+ // reduce spilling. Drop any instruction we collected to increase occupancy.
+ if (!NumRegionsSpilling) {
+ RematInstructions.truncate(RematSpillingCutoff);
+ RA_DEBUG(dbgs() << "Can only eliminate spilling\n");
+ } else {
+ RA_DEBUG(dbgs() << "Can only reduce spilling\n");
+ }
+ return true;
+ }
RA_DEBUG(dbgs() << "Cannot increase occupancy through rematerialization\n");
return false;
}
@@ -1634,12 +1697,9 @@ void PreRARematStage::sinkTriviallyRematInsts(
DAG.LIS->InsertMachineInstrInMaps(*NewMI);
NewMIs.push_back(NewMI);
- // Update region boundaries in scheduling region we sinked from since we
- // may sink an instruction that was at the beginning or end of its region.
- DAG.updateRegionBoundaries(DAG.Regions, DefMI, /*NewMI =*/nullptr,
- /*Removing =*/true);
-
- // Update region boundaries in region we sinked to.
+ // Update region boundaries in regions we sinked from (remove defining MI)
+ // and to (insert MI rematerialized in use block).
+ DAG.updateRegionBoundaries(DAG.Regions, DefMI, nullptr);
DAG.updateRegionBoundaries(DAG.Regions, InsertPos, NewMI);
// Collect all regions impacted by the rematerialization and update their
@@ -1670,50 +1730,68 @@ void PreRARematStage::sinkTriviallyRematInsts(
ImpactedRegions[Remat.DefRegion] = true;
}
- // Clean up the IR; remove rematerialized instructions from state.
+ // Remove rematerialized instructions from state and update LIS.
+ SlotIndexes *Slots = DAG.LIS->getSlotIndexes();
for (auto [Remat, NewMI] : llvm::zip_equal(RematInstructions, NewMIs)) {
- // Remove rematerialized instruction from BBLiveInMap since we are sinking
- // it from its MBB. Also remove it from live intervals and from its MBB.
+ // Remove rematerialized instruction from live intervals and from its MBB.
MachineInstr *OldMI = Remat.RematMI;
- DAG.BBLiveInMap.erase(OldMI);
DAG.LIS->RemoveMachineInstrFromMaps(*OldMI);
OldMI->eraseFromParent();
// Update the register's live interval manually. This is aligned with the
// instruction collection phase in that it assumes a single def and use
// (possibly subreg).
- SlotIndexes *Slots = DAG.LIS->getSlotIndexes();
SlotIndex NewDef = Slots->getInstructionIndex(*NewMI).getRegSlot();
SlotIndex NewKill = Slots->getInstructionIndex(*Remat.UseMI).getRegSlot();
- auto UpdateSegment = [&](LiveRange::Segments &Segments) -> void {
- assert(Segments.size() == 1 && "expected single segment");
- LiveRange::Segment &Seg = Segments.front();
+ // Update the live range to reflect the register's rematerialization. We
+ // will have a single segment (we only remat regs with one use) and at most
+ // a single value number (we only remat virtual regs i.e., no redefinition).
+ auto UpdateLiveRange = [&](LiveRange &LR) -> void {
+ assert(LR.segments.size() && "expected at least one segment");
+ assert(LR.valnos.size() < 2 && "expected at most one value number");
+
+ // Fix up the first segment and remove the others which have become
+ // unnecessary by construction.
+ LR.segments.truncate(1);
+ LiveRange::Segment &Seg = LR.segments.front();
Seg.start = NewDef;
Seg.end = NewKill;
- if (Seg.valno)
+ LR.valnos.clear();
+ if (Seg.valno) {
+ // If present, update the segment's value number as well.
Seg.valno->def = NewDef;
+ LR.valnos.push_back(Seg.valno);
+ }
};
Register Reg = NewMI->getOperand(0).getReg();
LiveInterval &RegLI = DAG.LIS->getInterval(Reg);
- UpdateSegment(RegLI.segments);
+ UpdateLiveRange(RegLI);
if (RegLI.hasSubRanges()) {
LiveInterval::SubRange &SubRange = *RegLI.subrange_begin();
assert(!SubRange.Next && "expected at most one subrange");
- UpdateSegment(SubRange.segments);
+ UpdateLiveRange(SubRange);
}
}
// All regions impacted by at least one rematerialization must be rescheduled.
- BitVector NewRescheduleRegions(DAG.Regions.size());
+ // Maximum pressure must also be recomputed for all regions where it changed
+ // non-predictably.
for (auto &[I, RecomputeRP] : ImpactedRegions) {
- NewRescheduleRegions[I] = true;
- DAG.MBBLiveIns.erase(DAG.Regions[I].first->getParent());
+ bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
+ DAG.RescheduleRegions[I] = !IsEmptyRegion;
+ if (!RecomputeRP)
+ continue;
- // Recompute maximum RP in regions in which at least one instruction was
- // rematerialized from or to.
- if (RecomputeRP) {
+ if (IsEmptyRegion) {
+ // The maximum RP in an empty region is simply the pressure induced by its
+ // live-ins.
+ GCNRegPressure RP;
+ for (auto [Reg, LaneMask] : DAG.LiveIns[I])
+ RP.inc(Reg, LaneBitmask::getNone(), LaneMask, DAG.MRI);
+ DAG.Pressure[I] = RP;
+ } else {
GCNDownwardRPTracker RPT(*DAG.LIS);
auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
DAG.Regions[I].second);
@@ -1722,7 +1800,6 @@ void PreRARematStage::sinkTriviallyRematInsts(
DAG.Pressure[I] = RPT.moveMaxPressure();
}
}
- DAG.RescheduleRegions = NewRescheduleRegions;
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
@@ -1748,45 +1825,50 @@ bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
return true;
}
-// When removing, we will have to check both beginning and ending of the region.
-// When inserting, we will only have to check if we are inserting NewMI in front
-// of a scheduling region and do not need to check the ending since we will only
-// ever be inserting before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>> &RegionBoundaries,
- MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
+ MachineBasicBlock::iterator MI, MachineInstr *NewMI) {
unsigned I = 0, E = RegionBoundaries.size();
- // Search for first region of the block where MI is located
- while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
+ // Search for first region of the block where MI is located. We may encounter
+ // an empty region if all instructions from an initially non-empty region were
+ // removed.
+ while (I != E && RegionBoundaries[I].first != RegionBoundaries[I].second &&
+ MI->getParent() != RegionBoundaries[I].first->getParent())
++I;
for (; I != E; ++I) {
- if (MI->getParent() != RegionBoundaries[I].first->getParent())
+ auto &Bounds = RegionBoundaries[I];
+ assert(MI != Bounds.second && "cannot insert at region end");
+    assert((!NewMI || NewMI != Bounds.second) && "cannot remove at region end");
+
+    // We may encounter an empty region if all of the region's instructions were
+ // removed.
+ if (Bounds.first == Bounds.second) {
+ if (MI->getParent()->end() != Bounds.second)
+ return;
+ continue;
+ }
+ if (MI->getParent() != Bounds.first->getParent())
return;
- if (Removing && MI == RegionBoundaries[I].first &&
- MI == RegionBoundaries[I].second) {
- // MI is in a region with size 1, after removing, the region will be
- // size 0, set RegionBegin and RegionEnd to pass end of block iterator.
- RegionBoundaries[I] =
- std::pair(MI->getParent()->end(), MI->getParent()->end());
- return;
- }
- if (MI == RegionBoundaries[I].first) {
- if (Removing)
- RegionBoundaries[I] =
- std::pair(std::next(MI), RegionBoundaries[I].second);
+ // We only care for modifications at the beginning of the region since the
+ // upper region boundary is exclusive.
+ if (MI != Bounds.first)
+ continue;
+ if (!NewMI) {
+ // This is an MI removal, which may leave the region empty; in such cases
+ // set both boundaries to the removed instruction's MBB's end.
+ MachineBasicBlock::iterator NextMI = std::next(MI);
+ if (NextMI != Bounds.second)
+ Bounds.first = NextMI;
else
- // Inserted NewMI in front of region, set new RegionBegin to NewMI
- RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
- RegionBoundaries[I].second);
- return;
- }
- if (Removing && MI == RegionBoundaries[I].second) {
- RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
- return;
+ Bounds.first = Bounds.second = MI->getParent()->end();
+ } else {
+      // This is an MI insertion at the beginning of the region.
+ Bounds.first = NewMI;
}
+ return;
}
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 9ed88f2b39cf0b..37143bf961238b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -275,12 +275,15 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Compute and cache live-ins and pressure for all regions in block.
void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);
- // Update region boundaries when removing MI or inserting NewMI before MI.
+ /// If necessary, updates a region's boundaries following insertion ( \p NewMI
+ /// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region.
+ /// For an MI removal, this must be called before the MI is actually erased
+ /// from its parent MBB. If a region is left empty by a removal, both
+ /// boundaries are set to the last removed MI's MBB's end.
void updateRegionBoundaries(
SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>> &RegionBoundaries,
- MachineBasicBlock::iterator MI, MachineInstr *NewMI,
- bool Removing = false);
+ MachineBasicBlock::iterator MI, MachineInstr *NewMI);
void runSchedStages();
@@ -448,12 +451,13 @@ class PreRARematStage : public GCNSchedStage {
: RematMI(RematMI), UseMI(UseMI), DefRegion(DefRegion) {}
};
- /// Determines whether we can increase function occupancy by 1 through
- /// rematerialization. If we can, returns true and fill \p RematInstructions
- /// with a list of rematerializable instructions whose sinking would result in
- /// increased occupancy; returns false otherwise.
- bool
- canIncreaseOccupancy(SmallVectorImpl<RematInstruction> &RematInstructions);
+ /// Determines whether we can increase function occupancy by 1 or
+ /// reduce/eliminate spilling through rematerialization. If we can, returns
+ /// true and fill \p RematInstructions with a list of rematerializable
+ /// instructions whose sinking would result in increased occupancy and/or
+ /// reduced spilling; returns false otherwise.
+ bool canIncreaseOccupancyOrReduceSpill(
+ SmallVectorImpl<RematInstruction> &RematInstructions);
/// Whether the MI is trivially rematerializable and does not have any virtual
/// register use.
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 06ad445f865ff8..2077941b3343e0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -371,6 +371,10 @@ unsigned GCNSubtarget::getNumVGPRsToIncreaseOccupancy(unsigned NumVGPRs) const {
return AMDGPU::IsaInfo::getVGPRReductionToIncreaseWavesPerEU(this, NumVGPRs);
}
+unsigned GCNSubtarget::getNumVGPRsToEliminateSpilling(unsigned NumVGPRs) const {
+ return AMDGPU::IsaInfo::getVGPRReductionToEliminateSpilling(this, NumVGPRs);
+}
+
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 99e163a5661fe1..5e82dfb7271ee7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1373,6 +1373,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// already results in maximum occupancy.
unsigned getNumVGPRsToIncreaseOccupancy(unsigned VGPRs) const;
+ /// Returns the necessary reduction in number of VGPRs from using \p VGPRs
+ /// VGPRs to eliminate spilling. Returns 0 when using \p VGPRs VGPRs does not
+ /// result in spilling.
+ unsigned getNumVGPRsToEliminateSpilling(unsigned VGPRs) const;
+
/// Return occupancy for the given function. Used LDS and a number of
/// registers if provided.
/// Note, occupancy can be affected by the scratch allocation as well, but
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 67332b25a45e08..091bf0f1495924 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1198,6 +1198,15 @@ unsigned getVGPRReductionToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
return NumVGPRs - alignDown(TotalNumVGPRs / (NumWaves + 1), Granule);
}
+unsigned getVGPRReductionToEliminateSpilling(const MCSubtargetInfo *STI,
+ unsigned NumVGPRs) {
+ unsigned Granule = getVGPRAllocGranule(STI);
+ unsigned TotalNumVGPRs = getTotalNumVGPRs(STI);
+  // No spilling when the granule-aligned allocation fits within the total
+  // number of VGPRs; using <= avoids an unsigned underflow below when the
+  // aligned request equals TotalNumVGPRs but NumVGPRs is smaller.
+  if (alignTo(NumVGPRs, Granule) <= TotalNumVGPRs)
+    return 0;
+  return NumVGPRs - alignDown(TotalNumVGPRs, Granule);
+}
+
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
AMDGPUSubtarget::Generation Gen) {
if (Gen >= AMDGPUSubtarget::GFX10)
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 375a02f242e33b..583a447befd4c5 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -328,10 +328,15 @@ unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
/// to increase the achievable number of waves per EU for this subtarget by 1.
/// Returns 0 when using \p VGPRs VGPRs already results in maximum number of
/// waves per EU.
-
unsigned getVGPRReductionToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
unsigned NumVGPRs);
+/// Returns the necessary reduction in number of VGPRs from using \p VGPRs VGPRs
+/// to eliminate spilling for this subtarget. Returns 0 when using \p VGPRs
+/// VGPRs does not cause any spilling to happen.
+unsigned getVGPRReductionToEliminateSpilling(const MCSubtargetInfo *STI,
+ unsigned NumVGPRs);
+
/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index 0a9978dea0461a..df4410489e9cc7 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -4949,6 +4949,605 @@ body: |
S_ENDPGM 0
...
---
+name: test_occ_1_sink_for_spill
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX908-LABEL: name: test_occ_1_sink_for_spill
+ ; GFX908: bb.0:
+ ; GFX908-NEXT: successors: %bb.1(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.1:
+ ; GFX908-NEXT: successors: %bb.2(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_35:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_36:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 35, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_37:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 36, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_38:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 37, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_39:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 38, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_40:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 39, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_41:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 40, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_42:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 41, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_43:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 42, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_44:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 43, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_45:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 44, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_46:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 45, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_47:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 46, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_48:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 47, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_49:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 48, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_50:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 49, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_51:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 50, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_52:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 51, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_53:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 52, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_54:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 53, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_55:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 54, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_56:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 55, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_57:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 56, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_58:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 57, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_59:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 58, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_60:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 59, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_61:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 60, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_62:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 61, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_63:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 62, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_64:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 63, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_65:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_66:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 65, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_67:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 66, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_68:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 67, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_69:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 68, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_70:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 69, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_71:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 70, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_72:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 71, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_73:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 72, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_74:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 73, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_75:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 74, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_76:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 75, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_77:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 76, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_78:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 77, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_79:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 78, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_80:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 79, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_81:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 80, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_82:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 81, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_83:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 82, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_84:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 83, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_85:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 84, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_86:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 85, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_87:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 86, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_88:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 87, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_89:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 88, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_90:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 89, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_91:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 90, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_92:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 91, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_93:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 92, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_94:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 93, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_95:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 94, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_96:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 95, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_97:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 96, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_98:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 97, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_99:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 98, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_100:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 99, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_101:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 100, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_102:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 101, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_103:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 102, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_104:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 103, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_105:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 104, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_106:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 105, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_107:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 106, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_108:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 107, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_109:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 108, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_110:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 109, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_111:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 110, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_112:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 111, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_113:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 112, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_114:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 113, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_115:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 114, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_116:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 115, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_117:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 116, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_118:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 117, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_119:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 118, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_120:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 119, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_121:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 120, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_122:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 121, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_123:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 122, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_124:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 123, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_125:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 124, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_126:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 125, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_127:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 126, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_128:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 127, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_129:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 128, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_130:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 129, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_131:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 130, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_132:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 131, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_133:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 132, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_134:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 133, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_135:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 134, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_136:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 135, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_137:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 136, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_138:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 137, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_139:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 138, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_140:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 139, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_141:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 140, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_142:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 141, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_143:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 142, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_144:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 143, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_145:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 144, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_146:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 145, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_147:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 146, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_148:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 147, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_149:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 148, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_150:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 149, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_151:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 150, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_152:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 151, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_153:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 152, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_154:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 153, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_155:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 154, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_156:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 155, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_157:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 156, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_158:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 157, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_159:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 158, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_160:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 159, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_161:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 160, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_162:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 161, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_163:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 162, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_164:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 163, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_165:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 164, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_166:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 165, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_167:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 166, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_168:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 167, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_169:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 168, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_170:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 169, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_171:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 170, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_172:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 171, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_173:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 172, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_174:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 173, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_175:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 174, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_176:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 175, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_177:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 176, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_178:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 177, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_179:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 178, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_180:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 179, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_181:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 180, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_182:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 181, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_183:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 182, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_184:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 183, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_185:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 184, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_186:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 185, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_187:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 186, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_188:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 187, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_189:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 188, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_190:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 189, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_191:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 190, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_192:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 191, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_193:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 192, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_194:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 193, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_195:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 194, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_196:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 195, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_197:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 196, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_198:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 197, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_199:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 198, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_200:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 199, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_201:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 200, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_202:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 201, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_203:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 202, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_204:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 203, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_205:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 204, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_206:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 205, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_207:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 206, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_208:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 207, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_209:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 208, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_210:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 209, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_211:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 210, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_212:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 211, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_213:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 212, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_214:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 213, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_215:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 214, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_216:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 215, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_217:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 216, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_218:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 217, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_219:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 218, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_220:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 219, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_221:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 220, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_222:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 221, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_223:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 222, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_224:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 223, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_225:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 224, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_226:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 225, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_227:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 226, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_228:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 227, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_229:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 228, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_230:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 229, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_231:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 230, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_232:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 231, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_233:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 232, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_234:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 233, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_235:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 234, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_236:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 235, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_237:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 236, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_238:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 237, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_239:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 238, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_240:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 239, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_241:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 240, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_242:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 241, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_243:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 242, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_244:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 243, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_245:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 244, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_246:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 245, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_247:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 246, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_248:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 247, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_249:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 248, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_250:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 249, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_251:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 250, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_252:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 251, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_253:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 252, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_254:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 253, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 254, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.2:
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_10]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]], implicit [[V_CVT_I32_F64_e32_20]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_34]], implicit [[V_CVT_I32_F64_e32_35]], implicit [[V_CVT_I32_F64_e32_36]], implicit [[V_CVT_I32_F64_e32_37]], implicit [[V_CVT_I32_F64_e32_38]], implicit [[V_CVT_I32_F64_e32_39]], implicit [[V_CVT_I32_F64_e32_40]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_41]], implicit [[V_CVT_I32_F64_e32_42]], implicit [[V_CVT_I32_F64_e32_43]], implicit [[V_CVT_I32_F64_e32_44]], implicit [[V_CVT_I32_F64_e32_45]], implicit [[V_CVT_I32_F64_e32_46]], implicit [[V_CVT_I32_F64_e32_47]], implicit [[V_CVT_I32_F64_e32_48]], implicit [[V_CVT_I32_F64_e32_49]], implicit [[V_CVT_I32_F64_e32_50]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_51]], implicit [[V_CVT_I32_F64_e32_52]], implicit [[V_CVT_I32_F64_e32_53]], implicit [[V_CVT_I32_F64_e32_54]], implicit [[V_CVT_I32_F64_e32_55]], implicit [[V_CVT_I32_F64_e32_56]], implicit [[V_CVT_I32_F64_e32_57]], implicit [[V_CVT_I32_F64_e32_58]], implicit [[V_CVT_I32_F64_e32_59]], implicit [[V_CVT_I32_F64_e32_60]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_61]], implicit [[V_CVT_I32_F64_e32_62]], implicit [[V_CVT_I32_F64_e32_63]], implicit [[V_CVT_I32_F64_e32_64]], implicit [[V_CVT_I32_F64_e32_65]], implicit [[V_CVT_I32_F64_e32_66]], implicit [[V_CVT_I32_F64_e32_67]], implicit [[V_CVT_I32_F64_e32_68]], implicit [[V_CVT_I32_F64_e32_69]], implicit [[V_CVT_I32_F64_e32_70]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_71]], implicit [[V_CVT_I32_F64_e32_72]], implicit [[V_CVT_I32_F64_e32_73]], implicit [[V_CVT_I32_F64_e32_74]], implicit [[V_CVT_I32_F64_e32_75]], implicit [[V_CVT_I32_F64_e32_76]], implicit [[V_CVT_I32_F64_e32_77]], implicit [[V_CVT_I32_F64_e32_78]], implicit [[V_CVT_I32_F64_e32_79]], implicit [[V_CVT_I32_F64_e32_80]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_81]], implicit [[V_CVT_I32_F64_e32_82]], implicit [[V_CVT_I32_F64_e32_83]], implicit [[V_CVT_I32_F64_e32_84]], implicit [[V_CVT_I32_F64_e32_85]], implicit [[V_CVT_I32_F64_e32_86]], implicit [[V_CVT_I32_F64_e32_87]], implicit [[V_CVT_I32_F64_e32_88]], implicit [[V_CVT_I32_F64_e32_89]], implicit [[V_CVT_I32_F64_e32_90]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_91]], implicit [[V_CVT_I32_F64_e32_92]], implicit [[V_CVT_I32_F64_e32_93]], implicit [[V_CVT_I32_F64_e32_94]], implicit [[V_CVT_I32_F64_e32_95]], implicit [[V_CVT_I32_F64_e32_96]], implicit [[V_CVT_I32_F64_e32_97]], implicit [[V_CVT_I32_F64_e32_98]], implicit [[V_CVT_I32_F64_e32_99]], implicit [[V_CVT_I32_F64_e32_100]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_101]], implicit [[V_CVT_I32_F64_e32_102]], implicit [[V_CVT_I32_F64_e32_103]], implicit [[V_CVT_I32_F64_e32_104]], implicit [[V_CVT_I32_F64_e32_105]], implicit [[V_CVT_I32_F64_e32_106]], implicit [[V_CVT_I32_F64_e32_107]], implicit [[V_CVT_I32_F64_e32_108]], implicit [[V_CVT_I32_F64_e32_109]], implicit [[V_CVT_I32_F64_e32_110]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_111]], implicit [[V_CVT_I32_F64_e32_112]], implicit [[V_CVT_I32_F64_e32_113]], implicit [[V_CVT_I32_F64_e32_114]], implicit [[V_CVT_I32_F64_e32_115]], implicit [[V_CVT_I32_F64_e32_116]], implicit [[V_CVT_I32_F64_e32_117]], implicit [[V_CVT_I32_F64_e32_118]], implicit [[V_CVT_I32_F64_e32_119]], implicit [[V_CVT_I32_F64_e32_120]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_121]], implicit [[V_CVT_I32_F64_e32_122]], implicit [[V_CVT_I32_F64_e32_123]], implicit [[V_CVT_I32_F64_e32_124]], implicit [[V_CVT_I32_F64_e32_125]], implicit [[V_CVT_I32_F64_e32_126]], implicit [[V_CVT_I32_F64_e32_127]], implicit [[V_CVT_I32_F64_e32_128]], implicit [[V_CVT_I32_F64_e32_129]], implicit [[V_CVT_I32_F64_e32_130]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_131]], implicit [[V_CVT_I32_F64_e32_132]], implicit [[V_CVT_I32_F64_e32_133]], implicit [[V_CVT_I32_F64_e32_134]], implicit [[V_CVT_I32_F64_e32_135]], implicit [[V_CVT_I32_F64_e32_136]], implicit [[V_CVT_I32_F64_e32_137]], implicit [[V_CVT_I32_F64_e32_138]], implicit [[V_CVT_I32_F64_e32_139]], implicit [[V_CVT_I32_F64_e32_140]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_141]], implicit [[V_CVT_I32_F64_e32_142]], implicit [[V_CVT_I32_F64_e32_143]], implicit [[V_CVT_I32_F64_e32_144]], implicit [[V_CVT_I32_F64_e32_145]], implicit [[V_CVT_I32_F64_e32_146]], implicit [[V_CVT_I32_F64_e32_147]], implicit [[V_CVT_I32_F64_e32_148]], implicit [[V_CVT_I32_F64_e32_149]], implicit [[V_CVT_I32_F64_e32_150]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_151]], implicit [[V_CVT_I32_F64_e32_152]], implicit [[V_CVT_I32_F64_e32_153]], implicit [[V_CVT_I32_F64_e32_154]], implicit [[V_CVT_I32_F64_e32_155]], implicit [[V_CVT_I32_F64_e32_156]], implicit [[V_CVT_I32_F64_e32_157]], implicit [[V_CVT_I32_F64_e32_158]], implicit [[V_CVT_I32_F64_e32_159]], implicit [[V_CVT_I32_F64_e32_160]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_161]], implicit [[V_CVT_I32_F64_e32_162]], implicit [[V_CVT_I32_F64_e32_163]], implicit [[V_CVT_I32_F64_e32_164]], implicit [[V_CVT_I32_F64_e32_165]], implicit [[V_CVT_I32_F64_e32_166]], implicit [[V_CVT_I32_F64_e32_167]], implicit [[V_CVT_I32_F64_e32_168]], implicit [[V_CVT_I32_F64_e32_169]], implicit [[V_CVT_I32_F64_e32_170]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_171]], implicit [[V_CVT_I32_F64_e32_172]], implicit [[V_CVT_I32_F64_e32_173]], implicit [[V_CVT_I32_F64_e32_174]], implicit [[V_CVT_I32_F64_e32_175]], implicit [[V_CVT_I32_F64_e32_176]], implicit [[V_CVT_I32_F64_e32_177]], implicit [[V_CVT_I32_F64_e32_178]], implicit [[V_CVT_I32_F64_e32_179]], implicit [[V_CVT_I32_F64_e32_180]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_181]], implicit [[V_CVT_I32_F64_e32_182]], implicit [[V_CVT_I32_F64_e32_183]], implicit [[V_CVT_I32_F64_e32_184]], implicit [[V_CVT_I32_F64_e32_185]], implicit [[V_CVT_I32_F64_e32_186]], implicit [[V_CVT_I32_F64_e32_187]], implicit [[V_CVT_I32_F64_e32_188]], implicit [[V_CVT_I32_F64_e32_189]], implicit [[V_CVT_I32_F64_e32_190]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_191]], implicit [[V_CVT_I32_F64_e32_192]], implicit [[V_CVT_I32_F64_e32_193]], implicit [[V_CVT_I32_F64_e32_194]], implicit [[V_CVT_I32_F64_e32_195]], implicit [[V_CVT_I32_F64_e32_196]], implicit [[V_CVT_I32_F64_e32_197]], implicit [[V_CVT_I32_F64_e32_198]], implicit [[V_CVT_I32_F64_e32_199]], implicit [[V_CVT_I32_F64_e32_200]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_201]], implicit [[V_CVT_I32_F64_e32_202]], implicit [[V_CVT_I32_F64_e32_203]], implicit [[V_CVT_I32_F64_e32_204]], implicit [[V_CVT_I32_F64_e32_205]], implicit [[V_CVT_I32_F64_e32_206]], implicit [[V_CVT_I32_F64_e32_207]], implicit [[V_CVT_I32_F64_e32_208]], implicit [[V_CVT_I32_F64_e32_209]], implicit [[V_CVT_I32_F64_e32_210]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_211]], implicit [[V_CVT_I32_F64_e32_212]], implicit [[V_CVT_I32_F64_e32_213]], implicit [[V_CVT_I32_F64_e32_214]], implicit [[V_CVT_I32_F64_e32_215]], implicit [[V_CVT_I32_F64_e32_216]], implicit [[V_CVT_I32_F64_e32_217]], implicit [[V_CVT_I32_F64_e32_218]], implicit [[V_CVT_I32_F64_e32_219]], implicit [[V_CVT_I32_F64_e32_220]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_221]], implicit [[V_CVT_I32_F64_e32_222]], implicit [[V_CVT_I32_F64_e32_223]], implicit [[V_CVT_I32_F64_e32_224]], implicit [[V_CVT_I32_F64_e32_225]], implicit [[V_CVT_I32_F64_e32_226]], implicit [[V_CVT_I32_F64_e32_227]], implicit [[V_CVT_I32_F64_e32_228]], implicit [[V_CVT_I32_F64_e32_229]], implicit [[V_CVT_I32_F64_e32_230]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_231]], implicit [[V_CVT_I32_F64_e32_232]], implicit [[V_CVT_I32_F64_e32_233]], implicit [[V_CVT_I32_F64_e32_234]], implicit [[V_CVT_I32_F64_e32_235]], implicit [[V_CVT_I32_F64_e32_236]], implicit [[V_CVT_I32_F64_e32_237]], implicit [[V_CVT_I32_F64_e32_238]], implicit [[V_CVT_I32_F64_e32_239]], implicit [[V_CVT_I32_F64_e32_240]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_241]], implicit [[V_CVT_I32_F64_e32_242]], implicit [[V_CVT_I32_F64_e32_243]], implicit [[V_CVT_I32_F64_e32_244]], implicit [[V_CVT_I32_F64_e32_245]], implicit [[V_CVT_I32_F64_e32_246]], implicit [[V_CVT_I32_F64_e32_247]], implicit [[V_CVT_I32_F64_e32_248]], implicit [[V_CVT_I32_F64_e32_249]], implicit [[V_CVT_I32_F64_e32_250]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_255]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_256]], implicit [[V_CVT_I32_F64_e32_]]
+ ; GFX908-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ %255:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
+ %256:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
+
+ bb.1:
+ successors: %bb.2
+
+ %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+ %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+ %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+ %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+ %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+ %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+ %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+ %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+ %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+ %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+ %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+ %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+ %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+ %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+ %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+ %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+ %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+ %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+ %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+ %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+ %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+ %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+ %34:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0
+ %35:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 35, implicit $exec, implicit $mode, implicit-def $m0
+ %36:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 36, implicit $exec, implicit $mode, implicit-def $m0
+ %37:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 37, implicit $exec, implicit $mode, implicit-def $m0
+ %38:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 38, implicit $exec, implicit $mode, implicit-def $m0
+ %39:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 39, implicit $exec, implicit $mode, implicit-def $m0
+ %40:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 40, implicit $exec, implicit $mode, implicit-def $m0
+ %41:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 41, implicit $exec, implicit $mode, implicit-def $m0
+ %42:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 42, implicit $exec, implicit $mode, implicit-def $m0
+ %43:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 43, implicit $exec, implicit $mode, implicit-def $m0
+ %44:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 44, implicit $exec, implicit $mode, implicit-def $m0
+ %45:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 45, implicit $exec, implicit $mode, implicit-def $m0
+ %46:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 46, implicit $exec, implicit $mode, implicit-def $m0
+ %47:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 47, implicit $exec, implicit $mode, implicit-def $m0
+ %48:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 48, implicit $exec, implicit $mode, implicit-def $m0
+ %49:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 49, implicit $exec, implicit $mode, implicit-def $m0
+ %50:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 50, implicit $exec, implicit $mode, implicit-def $m0
+ %51:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 51, implicit $exec, implicit $mode, implicit-def $m0
+ %52:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 52, implicit $exec, implicit $mode, implicit-def $m0
+ %53:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 53, implicit $exec, implicit $mode, implicit-def $m0
+ %54:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 54, implicit $exec, implicit $mode, implicit-def $m0
+ %55:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 55, implicit $exec, implicit $mode, implicit-def $m0
+ %56:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 56, implicit $exec, implicit $mode, implicit-def $m0
+ %57:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 57, implicit $exec, implicit $mode, implicit-def $m0
+ %58:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 58, implicit $exec, implicit $mode, implicit-def $m0
+ %59:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 59, implicit $exec, implicit $mode, implicit-def $m0
+ %60:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 60, implicit $exec, implicit $mode, implicit-def $m0
+ %61:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 61, implicit $exec, implicit $mode, implicit-def $m0
+ %62:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 62, implicit $exec, implicit $mode, implicit-def $m0
+ %63:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 63, implicit $exec, implicit $mode, implicit-def $m0
+ %64:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode, implicit-def $m0
+ %65:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 65, implicit $exec, implicit $mode, implicit-def $m0
+ %66:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 66, implicit $exec, implicit $mode, implicit-def $m0
+ %67:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 67, implicit $exec, implicit $mode, implicit-def $m0
+ %68:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 68, implicit $exec, implicit $mode, implicit-def $m0
+ %69:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 69, implicit $exec, implicit $mode, implicit-def $m0
+ %70:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 70, implicit $exec, implicit $mode, implicit-def $m0
+ %71:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 71, implicit $exec, implicit $mode, implicit-def $m0
+ %72:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 72, implicit $exec, implicit $mode, implicit-def $m0
+ %73:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 73, implicit $exec, implicit $mode, implicit-def $m0
+ %74:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 74, implicit $exec, implicit $mode, implicit-def $m0
+ %75:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 75, implicit $exec, implicit $mode, implicit-def $m0
+ %76:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 76, implicit $exec, implicit $mode, implicit-def $m0
+ %77:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 77, implicit $exec, implicit $mode, implicit-def $m0
+ %78:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 78, implicit $exec, implicit $mode, implicit-def $m0
+ %79:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 79, implicit $exec, implicit $mode, implicit-def $m0
+ %80:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 80, implicit $exec, implicit $mode, implicit-def $m0
+ %81:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 81, implicit $exec, implicit $mode, implicit-def $m0
+ %82:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 82, implicit $exec, implicit $mode, implicit-def $m0
+ %83:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 83, implicit $exec, implicit $mode, implicit-def $m0
+ %84:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 84, implicit $exec, implicit $mode, implicit-def $m0
+ %85:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 85, implicit $exec, implicit $mode, implicit-def $m0
+ %86:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 86, implicit $exec, implicit $mode, implicit-def $m0
+ %87:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 87, implicit $exec, implicit $mode, implicit-def $m0
+ %88:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 88, implicit $exec, implicit $mode, implicit-def $m0
+ %89:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 89, implicit $exec, implicit $mode, implicit-def $m0
+ %90:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 90, implicit $exec, implicit $mode, implicit-def $m0
+ %91:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 91, implicit $exec, implicit $mode, implicit-def $m0
+ %92:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 92, implicit $exec, implicit $mode, implicit-def $m0
+ %93:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 93, implicit $exec, implicit $mode, implicit-def $m0
+ %94:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 94, implicit $exec, implicit $mode, implicit-def $m0
+ %95:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 95, implicit $exec, implicit $mode, implicit-def $m0
+ %96:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 96, implicit $exec, implicit $mode, implicit-def $m0
+ %97:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 97, implicit $exec, implicit $mode, implicit-def $m0
+ %98:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 98, implicit $exec, implicit $mode, implicit-def $m0
+ %99:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 99, implicit $exec, implicit $mode, implicit-def $m0
+ %100:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 100, implicit $exec, implicit $mode, implicit-def $m0
+ %101:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 101, implicit $exec, implicit $mode, implicit-def $m0
+ %102:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 102, implicit $exec, implicit $mode, implicit-def $m0
+ %103:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 103, implicit $exec, implicit $mode, implicit-def $m0
+ %104:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 104, implicit $exec, implicit $mode, implicit-def $m0
+ %105:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 105, implicit $exec, implicit $mode, implicit-def $m0
+ %106:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 106, implicit $exec, implicit $mode, implicit-def $m0
+ %107:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 107, implicit $exec, implicit $mode, implicit-def $m0
+ %108:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 108, implicit $exec, implicit $mode, implicit-def $m0
+ %109:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 109, implicit $exec, implicit $mode, implicit-def $m0
+ %110:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 110, implicit $exec, implicit $mode, implicit-def $m0
+ %111:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 111, implicit $exec, implicit $mode, implicit-def $m0
+ %112:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 112, implicit $exec, implicit $mode, implicit-def $m0
+ %113:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 113, implicit $exec, implicit $mode, implicit-def $m0
+ %114:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 114, implicit $exec, implicit $mode, implicit-def $m0
+ %115:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 115, implicit $exec, implicit $mode, implicit-def $m0
+ %116:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 116, implicit $exec, implicit $mode, implicit-def $m0
+ %117:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 117, implicit $exec, implicit $mode, implicit-def $m0
+ %118:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 118, implicit $exec, implicit $mode, implicit-def $m0
+ %119:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 119, implicit $exec, implicit $mode, implicit-def $m0
+ %120:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 120, implicit $exec, implicit $mode, implicit-def $m0
+ %121:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 121, implicit $exec, implicit $mode, implicit-def $m0
+ %122:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 122, implicit $exec, implicit $mode, implicit-def $m0
+ %123:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 123, implicit $exec, implicit $mode, implicit-def $m0
+ %124:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 124, implicit $exec, implicit $mode, implicit-def $m0
+ %125:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 125, implicit $exec, implicit $mode, implicit-def $m0
+ %126:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 126, implicit $exec, implicit $mode, implicit-def $m0
+ %127:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 127, implicit $exec, implicit $mode, implicit-def $m0
+ %128:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 128, implicit $exec, implicit $mode, implicit-def $m0
+ %129:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 129, implicit $exec, implicit $mode, implicit-def $m0
+ %130:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 130, implicit $exec, implicit $mode, implicit-def $m0
+ %131:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 131, implicit $exec, implicit $mode, implicit-def $m0
+ %132:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 132, implicit $exec, implicit $mode, implicit-def $m0
+ %133:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 133, implicit $exec, implicit $mode, implicit-def $m0
+ %134:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 134, implicit $exec, implicit $mode, implicit-def $m0
+ %135:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 135, implicit $exec, implicit $mode, implicit-def $m0
+ %136:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 136, implicit $exec, implicit $mode, implicit-def $m0
+ %137:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 137, implicit $exec, implicit $mode, implicit-def $m0
+ %138:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 138, implicit $exec, implicit $mode, implicit-def $m0
+ %139:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 139, implicit $exec, implicit $mode, implicit-def $m0
+ %140:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 140, implicit $exec, implicit $mode, implicit-def $m0
+ %141:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 141, implicit $exec, implicit $mode, implicit-def $m0
+ %142:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 142, implicit $exec, implicit $mode, implicit-def $m0
+ %143:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 143, implicit $exec, implicit $mode, implicit-def $m0
+ %144:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 144, implicit $exec, implicit $mode, implicit-def $m0
+ %145:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 145, implicit $exec, implicit $mode, implicit-def $m0
+ %146:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 146, implicit $exec, implicit $mode, implicit-def $m0
+ %147:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 147, implicit $exec, implicit $mode, implicit-def $m0
+ %148:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 148, implicit $exec, implicit $mode, implicit-def $m0
+ %149:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 149, implicit $exec, implicit $mode, implicit-def $m0
+ %150:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 150, implicit $exec, implicit $mode, implicit-def $m0
+ %151:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 151, implicit $exec, implicit $mode, implicit-def $m0
+ %152:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 152, implicit $exec, implicit $mode, implicit-def $m0
+ %153:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 153, implicit $exec, implicit $mode, implicit-def $m0
+ %154:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 154, implicit $exec, implicit $mode, implicit-def $m0
+ %155:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 155, implicit $exec, implicit $mode, implicit-def $m0
+ %156:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 156, implicit $exec, implicit $mode, implicit-def $m0
+ %157:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 157, implicit $exec, implicit $mode, implicit-def $m0
+ %158:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 158, implicit $exec, implicit $mode, implicit-def $m0
+ %159:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 159, implicit $exec, implicit $mode, implicit-def $m0
+ %160:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 160, implicit $exec, implicit $mode, implicit-def $m0
+ %161:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 161, implicit $exec, implicit $mode, implicit-def $m0
+ %162:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 162, implicit $exec, implicit $mode, implicit-def $m0
+ %163:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 163, implicit $exec, implicit $mode, implicit-def $m0
+ %164:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 164, implicit $exec, implicit $mode, implicit-def $m0
+ %165:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 165, implicit $exec, implicit $mode, implicit-def $m0
+ %166:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 166, implicit $exec, implicit $mode, implicit-def $m0
+ %167:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 167, implicit $exec, implicit $mode, implicit-def $m0
+ %168:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 168, implicit $exec, implicit $mode, implicit-def $m0
+ %169:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 169, implicit $exec, implicit $mode, implicit-def $m0
+ %170:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 170, implicit $exec, implicit $mode, implicit-def $m0
+ %171:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 171, implicit $exec, implicit $mode, implicit-def $m0
+ %172:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 172, implicit $exec, implicit $mode, implicit-def $m0
+ %173:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 173, implicit $exec, implicit $mode, implicit-def $m0
+ %174:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 174, implicit $exec, implicit $mode, implicit-def $m0
+ %175:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 175, implicit $exec, implicit $mode, implicit-def $m0
+ %176:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 176, implicit $exec, implicit $mode, implicit-def $m0
+ %177:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 177, implicit $exec, implicit $mode, implicit-def $m0
+ %178:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 178, implicit $exec, implicit $mode, implicit-def $m0
+ %179:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 179, implicit $exec, implicit $mode, implicit-def $m0
+ %180:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 180, implicit $exec, implicit $mode, implicit-def $m0
+ %181:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 181, implicit $exec, implicit $mode, implicit-def $m0
+ %182:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 182, implicit $exec, implicit $mode, implicit-def $m0
+ %183:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 183, implicit $exec, implicit $mode, implicit-def $m0
+ %184:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 184, implicit $exec, implicit $mode, implicit-def $m0
+ %185:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 185, implicit $exec, implicit $mode, implicit-def $m0
+ %186:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 186, implicit $exec, implicit $mode, implicit-def $m0
+ %187:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 187, implicit $exec, implicit $mode, implicit-def $m0
+ %188:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 188, implicit $exec, implicit $mode, implicit-def $m0
+ %189:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 189, implicit $exec, implicit $mode, implicit-def $m0
+ %190:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 190, implicit $exec, implicit $mode, implicit-def $m0
+ %191:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 191, implicit $exec, implicit $mode, implicit-def $m0
+ %192:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 192, implicit $exec, implicit $mode, implicit-def $m0
+ %193:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 193, implicit $exec, implicit $mode, implicit-def $m0
+ %194:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 194, implicit $exec, implicit $mode, implicit-def $m0
+ %195:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 195, implicit $exec, implicit $mode, implicit-def $m0
+ %196:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 196, implicit $exec, implicit $mode, implicit-def $m0
+ %197:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 197, implicit $exec, implicit $mode, implicit-def $m0
+ %198:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 198, implicit $exec, implicit $mode, implicit-def $m0
+ %199:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 199, implicit $exec, implicit $mode, implicit-def $m0
+ %200:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 200, implicit $exec, implicit $mode, implicit-def $m0
+ %201:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 201, implicit $exec, implicit $mode, implicit-def $m0
+ %202:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 202, implicit $exec, implicit $mode, implicit-def $m0
+ %203:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 203, implicit $exec, implicit $mode, implicit-def $m0
+ %204:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 204, implicit $exec, implicit $mode, implicit-def $m0
+ %205:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 205, implicit $exec, implicit $mode, implicit-def $m0
+ %206:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 206, implicit $exec, implicit $mode, implicit-def $m0
+ %207:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 207, implicit $exec, implicit $mode, implicit-def $m0
+ %208:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 208, implicit $exec, implicit $mode, implicit-def $m0
+ %209:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 209, implicit $exec, implicit $mode, implicit-def $m0
+ %210:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 210, implicit $exec, implicit $mode, implicit-def $m0
+ %211:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 211, implicit $exec, implicit $mode, implicit-def $m0
+ %212:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 212, implicit $exec, implicit $mode, implicit-def $m0
+ %213:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 213, implicit $exec, implicit $mode, implicit-def $m0
+ %214:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 214, implicit $exec, implicit $mode, implicit-def $m0
+ %215:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 215, implicit $exec, implicit $mode, implicit-def $m0
+ %216:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 216, implicit $exec, implicit $mode, implicit-def $m0
+ %217:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 217, implicit $exec, implicit $mode, implicit-def $m0
+ %218:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 218, implicit $exec, implicit $mode, implicit-def $m0
+ %219:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 219, implicit $exec, implicit $mode, implicit-def $m0
+ %220:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 220, implicit $exec, implicit $mode, implicit-def $m0
+ %221:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 221, implicit $exec, implicit $mode, implicit-def $m0
+ %222:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 222, implicit $exec, implicit $mode, implicit-def $m0
+ %223:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 223, implicit $exec, implicit $mode, implicit-def $m0
+ %224:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 224, implicit $exec, implicit $mode, implicit-def $m0
+ %225:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 225, implicit $exec, implicit $mode, implicit-def $m0
+ %226:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 226, implicit $exec, implicit $mode, implicit-def $m0
+ %227:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 227, implicit $exec, implicit $mode, implicit-def $m0
+ %228:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 228, implicit $exec, implicit $mode, implicit-def $m0
+ %229:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 229, implicit $exec, implicit $mode, implicit-def $m0
+ %230:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 230, implicit $exec, implicit $mode, implicit-def $m0
+ %231:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 231, implicit $exec, implicit $mode, implicit-def $m0
+ %232:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 232, implicit $exec, implicit $mode, implicit-def $m0
+ %233:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 233, implicit $exec, implicit $mode, implicit-def $m0
+ %234:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 234, implicit $exec, implicit $mode, implicit-def $m0
+ %235:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 235, implicit $exec, implicit $mode, implicit-def $m0
+ %236:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 236, implicit $exec, implicit $mode, implicit-def $m0
+ %237:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 237, implicit $exec, implicit $mode, implicit-def $m0
+ %238:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 238, implicit $exec, implicit $mode, implicit-def $m0
+ %239:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 239, implicit $exec, implicit $mode, implicit-def $m0
+ %240:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 240, implicit $exec, implicit $mode, implicit-def $m0
+ %241:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 241, implicit $exec, implicit $mode, implicit-def $m0
+ %242:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 242, implicit $exec, implicit $mode, implicit-def $m0
+ %243:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 243, implicit $exec, implicit $mode, implicit-def $m0
+ %244:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 244, implicit $exec, implicit $mode, implicit-def $m0
+ %245:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 245, implicit $exec, implicit $mode, implicit-def $m0
+ %246:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 246, implicit $exec, implicit $mode, implicit-def $m0
+ %247:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 247, implicit $exec, implicit $mode, implicit-def $m0
+ %248:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 248, implicit $exec, implicit $mode, implicit-def $m0
+ %249:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 249, implicit $exec, implicit $mode, implicit-def $m0
+ %250:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 250, implicit $exec, implicit $mode, implicit-def $m0
+ %251:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 251, implicit $exec, implicit $mode, implicit-def $m0
+ %252:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 252, implicit $exec, implicit $mode, implicit-def $m0
+ %253:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 253, implicit $exec, implicit $mode, implicit-def $m0
+ %254:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 254, implicit $exec, implicit $mode, implicit-def $m0
+
+ bb.2:
+
+ S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+ S_NOP 0, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19
+ S_NOP 0, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29
+ S_NOP 0, implicit %30, implicit %31, implicit %32, implicit %33, implicit %34, implicit %35, implicit %36, implicit %37, implicit %38, implicit %39
+ S_NOP 0, implicit %40, implicit %41, implicit %42, implicit %43, implicit %44, implicit %45, implicit %46, implicit %47, implicit %48, implicit %49
+ S_NOP 0, implicit %50, implicit %51, implicit %52, implicit %53, implicit %54, implicit %55, implicit %56, implicit %57, implicit %58, implicit %59
+ S_NOP 0, implicit %60, implicit %61, implicit %62, implicit %63, implicit %64, implicit %65, implicit %66, implicit %67, implicit %68, implicit %69
+ S_NOP 0, implicit %70, implicit %71, implicit %72, implicit %73, implicit %74, implicit %75, implicit %76, implicit %77, implicit %78, implicit %79
+ S_NOP 0, implicit %80, implicit %81, implicit %82, implicit %83, implicit %84, implicit %85, implicit %86, implicit %87, implicit %88, implicit %89
+ S_NOP 0, implicit %90, implicit %91, implicit %92, implicit %93, implicit %94, implicit %95, implicit %96, implicit %97, implicit %98, implicit %99
+ S_NOP 0, implicit %100, implicit %101, implicit %102, implicit %103, implicit %104, implicit %105, implicit %106, implicit %107, implicit %108, implicit %109
+ S_NOP 0, implicit %110, implicit %111, implicit %112, implicit %113, implicit %114, implicit %115, implicit %116, implicit %117, implicit %118, implicit %119
+ S_NOP 0, implicit %120, implicit %121, implicit %122, implicit %123, implicit %124, implicit %125, implicit %126, implicit %127, implicit %128, implicit %129
+ S_NOP 0, implicit %130, implicit %131, implicit %132, implicit %133, implicit %134, implicit %135, implicit %136, implicit %137, implicit %138, implicit %139
+ S_NOP 0, implicit %140, implicit %141, implicit %142, implicit %143, implicit %144, implicit %145, implicit %146, implicit %147, implicit %148, implicit %149
+ S_NOP 0, implicit %150, implicit %151, implicit %152, implicit %153, implicit %154, implicit %155, implicit %156, implicit %157, implicit %158, implicit %159
+ S_NOP 0, implicit %160, implicit %161, implicit %162, implicit %163, implicit %164, implicit %165, implicit %166, implicit %167, implicit %168, implicit %169
+ S_NOP 0, implicit %170, implicit %171, implicit %172, implicit %173, implicit %174, implicit %175, implicit %176, implicit %177, implicit %178, implicit %179
+ S_NOP 0, implicit %180, implicit %181, implicit %182, implicit %183, implicit %184, implicit %185, implicit %186, implicit %187, implicit %188, implicit %189
+ S_NOP 0, implicit %190, implicit %191, implicit %192, implicit %193, implicit %194, implicit %195, implicit %196, implicit %197, implicit %198, implicit %199
+ S_NOP 0, implicit %200, implicit %201, implicit %202, implicit %203, implicit %204, implicit %205, implicit %206, implicit %207, implicit %208, implicit %209
+ S_NOP 0, implicit %210, implicit %211, implicit %212, implicit %213, implicit %214, implicit %215, implicit %216, implicit %217, implicit %218, implicit %219
+ S_NOP 0, implicit %220, implicit %221, implicit %222, implicit %223, implicit %224, implicit %225, implicit %226, implicit %227, implicit %228, implicit %229
+ S_NOP 0, implicit %230, implicit %231, implicit %232, implicit %233, implicit %234, implicit %235, implicit %236, implicit %237, implicit %238, implicit %239
+ S_NOP 0, implicit %240, implicit %241, implicit %242, implicit %243, implicit %244, implicit %245, implicit %246, implicit %247, implicit %248, implicit %249
+ S_NOP 0, implicit %250, implicit %251, implicit %252, implicit %253, implicit %254
+
+ S_NOP 0, implicit %255, implicit %256
+
+ S_ENDPGM 0
+...
+---
name: test_no_sink_two_subregs_in_def_block
tracksRegLiveness: true
machineFunctionInfo:
More information about the llvm-commits
mailing list