[llvm] [AMDGPU][Scheduler] Refactor ArchVGPR rematerialization during scheduling (PR #125885)

Lucas Ramirez via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 27 04:14:01 PST 2025


https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/125885

>From e75789af053e3bab953f88916a1b7033fdbe50af Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Wed, 4 Dec 2024 15:49:33 +0100
Subject: [PATCH 1/8] Integrate #124327

---
 .../llvm/CodeGen/MachineRegisterInfo.h        |    4 +
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      |    5 +
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  754 +++--
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |  105 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       |   53 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  600 +---
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |   40 +-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   15 +
 ...ine-scheduler-sink-trivial-remats-attr.mir |  221 ++
 ...ne-scheduler-sink-trivial-remats-debug.mir |    4 +-
 .../machine-scheduler-sink-trivial-remats.mir | 2752 +++++++++--------
 11 files changed, 2367 insertions(+), 2186 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 1c465741cb462..cbf0347017d81 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -23,6 +23,7 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/RegisterBank.h"
@@ -592,6 +593,9 @@ class MachineRegisterInfo {
   /// multiple uses.
   bool hasOneNonDBGUser(Register RegNo) const;
 
+  /// If the register has a single non-Debug instruction using the specified
+  /// register, returns it; otherwise returns nullptr.
+  MachineInstr *getOneNonDBGUser(Register RegNo) const;
 
   /// hasAtMostUses - Return true if the given register has at most \p MaxUsers
   /// non-debug user instructions.
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 937f63f6c5e00..b7135251781ad 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -432,6 +432,11 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
   return hasSingleElement(use_nodbg_instructions(RegNo));
 }
 
+MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
+  auto RegNoDbgUsers = use_nodbg_instructions(RegNo);
+  return hasSingleElement(RegNoDbgUsers) ? &*RegNoDbgUsers.begin() : nullptr;
+}
+
 bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
                                               unsigned MaxUsers) const {
   return hasNItemsOrLess(use_instr_nodbg_begin(Reg), use_instr_nodbg_end(),
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 176586e3fbbb6..01cbd0aefba39 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -25,8 +25,13 @@
 
 #include "GCNSchedStrategy.h"
 #include "AMDGPUIGroupLP.h"
+#include "GCNRegPressure.h"
 #include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/ErrorHandling.h"
 
 #define DEBUG_TYPE "machine-scheduler"
 
@@ -307,11 +312,11 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
     HasHighPressure = true;
     if (SGPRDelta > VGPRDelta) {
       Cand.RPDelta.CriticalMax =
-        PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
+          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
       Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
     } else {
       Cand.RPDelta.CriticalMax =
-        PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
+          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
       Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
     }
   }
@@ -324,7 +329,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                          const RegPressureTracker &RPTracker,
                                          SchedCandidate &Cand,
                                          bool IsBottomUp) {
-  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
   ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
   unsigned SGPRPressure = 0;
   unsigned VGPRPressure = 0;
@@ -420,7 +425,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
                         /*IsBottomUp=*/false);
       assert(TCand.SU == TopCand.SU &&
-           "Last pick result should correspond to re-picking right now");
+             "Last pick result should correspond to re-picking right now");
     }
 #endif
   }
@@ -890,13 +895,13 @@ GCNScheduleDAGMILive::getRegionLiveInMap() const {
   std::vector<MachineInstr *> RegionFirstMIs;
   RegionFirstMIs.reserve(Regions.size());
   auto I = Regions.rbegin(), E = Regions.rend();
-  auto *BB = I->first->getParent();
   do {
+    const MachineBasicBlock *MBB = I->first->getParent();
     auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
     RegionFirstMIs.push_back(MI);
     do {
       ++I;
-    } while (I != E && I->first->getParent() == BB);
+    } while (I != E && I->first->getParent() == MBB);
   } while (I != E);
   return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
 }
@@ -1081,31 +1086,32 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
   return true;
 }
 
-bool PreRARematStage::initGCNSchedStage() {
-  if (!GCNSchedStage::initGCNSchedStage())
-    return false;
-
-  if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
-    return false;
-
-  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-  // Rematerialization will not help if occupancy is not limited by reg usage.
-  if (ST.getOccupancyWithWorkGroupSizes(MF).second == DAG.MinOccupancy)
-    return false;
+/// Allows to easily filter for this stage's debug output.
+#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
 
-  // FIXME: This pass will invalidate cached MBBLiveIns for regions
-  // inbetween the defs and region we sinked the def to. Cached pressure
-  // for regions where a def is sinked from will also be invalidated. Will
-  // need to be fixed if there is another pass after this pass.
+bool PreRARematStage::initGCNSchedStage() {
+  // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
+  // regions inbetween the defs and region we sinked the def to. Will need to be
+  // fixed if there is another pass after this pass.
   assert(!S.hasNextStage());
 
-  collectRematerializableInstructions();
-  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
+  if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() ||
+      DAG.Regions.size() == 1 || !canIncreaseOccupancyOrReduceSpill())
     return false;
 
-  LLVM_DEBUG(
-      dbgs() << "Retrying function scheduling with improved occupancy of "
-             << DAG.MinOccupancy << " from rematerializing\n");
+  // Rematerialize identified instructions and update scheduler's state.
+  rematerialize();
+  if (GCNTrackers)
+    DAG.RegionLiveOuts.buildLiveRegMap();
+  REMAT_DEBUG(
+      dbgs() << "Retrying function scheduling with new min. occupancy of "
+             << AchievedOcc << " from rematerializing (original was "
+             << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
+  if (AchievedOcc > DAG.MinOccupancy) {
+    DAG.MinOccupancy = AchievedOcc;
+    SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+    MFI.increaseOccupancy(MF, DAG.MinOccupancy);
+  }
   return true;
 }
 
@@ -1397,8 +1403,7 @@ GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
 #ifndef NDEBUG
   LLVM_DEBUG(
       printScheduleModel(ReadyCyclesSorted);
-      dbgs() << "\n\t"
-             << "Metric: "
+      dbgs() << "\n\t" << "Metric: "
              << (SumBubbles
                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                      : 1)
@@ -1433,8 +1438,7 @@ GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
 #ifndef NDEBUG
   LLVM_DEBUG(
       printScheduleModel(ReadyCyclesSorted);
-      dbgs() << "\n\t"
-             << "Metric: "
+      dbgs() << "\n\t" << "Metric: "
              << (SumBubbles
                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                      : 1)
@@ -1482,8 +1486,7 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
       dbgs()
       << "\n\t      *** In shouldRevertScheduling ***\n"
       << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
-  ScheduleMetrics MBefore =
-      getScheduleMetrics(DAG.SUnits);
+  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
   LLVM_DEBUG(
       dbgs()
       << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
@@ -1516,13 +1519,9 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
 }
 
 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
-  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
-    return true;
-
-  if (mayCauseSpilling(WavesAfter))
-    return true;
-
-  return false;
+  return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
+         mayCauseSpilling(WavesAfter) ||
+         (IncreaseOccupancy && WavesAfter < TargetOcc);
 }
 
 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -1673,160 +1672,235 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
   return true;
 }
 
-void PreRARematStage::collectRematerializableInstructions() {
+bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
-  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
-    Register Reg = Register::index2VirtReg(I);
-    if (!DAG.LIS->hasInterval(Reg))
-      continue;
-
-    // TODO: Handle AGPR and SGPR rematerialization
-    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
-        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
-      continue;
-
-    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
-    MachineInstr *Def = Op->getParent();
-    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
-      continue;
-
-    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
-    if (Def->getParent() == UseI->getParent())
-      continue;
 
-    bool HasRematDependency = false;
-    // Check if this instruction uses any registers that are planned to be
-    // rematerialized
-    for (auto &RematEntry : RematerializableInsts) {
-      if (find_if(RematEntry.second,
-                  [&Def](std::pair<MachineInstr *, MachineInstr *> &Remat) {
-                    for (MachineOperand &MO : Def->operands()) {
-                      if (!MO.isReg())
-                        continue;
-                      if (MO.getReg() == Remat.first->getOperand(0).getReg())
-                        return true;
-                    }
-                    return false;
-                  }) != RematEntry.second.end()) {
-        HasRematDependency = true;
-        break;
+  REMAT_DEBUG(dbgs() << "Collecting rematerializable instructions in "
+                     << MF.getFunction().getName() << '\n');
+
+  // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
+  // occupancy, or regions with VGPR spilling) to their excess RP.
+  DenseMap<unsigned, unsigned> OptRegions;
+
+  // Note that the maximum number of VGPRs to use to eliminate spill may be
+  // lower than the maximum number to increase occupancy when the function has
+  // the "amdgpu-num-vgpr" attribute.
+  const std::pair<unsigned, unsigned> OccBounds =
+      ST.getOccupancyWithWorkGroupSizes(MF);
+  // FIXME: we should be able to just call ST.getMaxNumArchVGPRs() but that
+  // would use the occupancy bounds as determined by
+  // MF.getFunction().getWavesPerEU(), which look incorrect in some cases.
+  const unsigned MaxVGPRsNoSpill =
+      ST.getBaseMaxNumVGPRs(MF.getFunction(),
+                            {ST.getMinNumArchVGPRs(OccBounds.second),
+                             ST.getMaxNumArchVGPRs(OccBounds.first)},
+                            false);
+  const unsigned MaxVGPRsIncOcc = ST.getMaxNumArchVGPRs(DAG.MinOccupancy + 1);
+  IncreaseOccupancy = OccBounds.second > DAG.MinOccupancy;
+
+  // Collect optimizable regions. If there is spilling in any region we will
+  // just try to reduce it. Otherwise we will try to increase occupancy by one.
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    GCNRegPressure &RP = DAG.Pressure[I];
+    unsigned NumVGPRs = RP.getArchVGPRNum();
+    unsigned ExcessRP = 0;
+    if (NumVGPRs > MaxVGPRsNoSpill) {
+      if (IncreaseOccupancy) {
+        // We won't try to increase occupancy.
+        IncreaseOccupancy = false;
+        OptRegions.clear();
       }
-    }
-    // Do not rematerialize an instruction if it uses an instruction that we
-    // have designated for rematerialization.
-    // FIXME: Allow for rematerialization chains: this requires 1. updating
-    // remat points to account for uses that are rematerialized, and 2. either
-    // rematerializing the candidates in careful ordering, or deferring the MBB
-    // RP walk until the entire chain has been rematerialized.
-    if (HasRematDependency)
-      continue;
-
-    // Similarly, check if the UseI is planned to be remat.
-    for (auto &RematEntry : RematerializableInsts) {
-      if (find_if(RematEntry.second,
-                  [&UseI](std::pair<MachineInstr *, MachineInstr *> &Remat) {
-                    return Remat.first == UseI;
-                  }) != RematEntry.second.end()) {
-        HasRematDependency = true;
-        break;
-      }
-    }
-
-    if (HasRematDependency)
-      break;
-
-    // We are only collecting defs that are defined in another block and are
-    // live-through or used inside regions at MinOccupancy. This means that the
-    // register must be in the live-in set for the region.
-    bool AddedToRematList = false;
-    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
-      auto It = DAG.LiveIns[I].find(Reg);
-      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
-        if (DAG.RegionsWithMinOcc[I]) {
-          SlotIndex DefIdx = DAG.LIS->getInstructionIndex(*Def);
-          SlotIndex UseIdx =
-              DAG.LIS->getInstructionIndex(*UseI).getRegSlot(true);
-          if (allUsesAvailableAt(Def, DefIdx, UseIdx)) {
-            RematerializableInsts[I][Def] = UseI;
-            AddedToRematList = true;
-          }
-        }
-
-        // Collect regions with rematerializable reg as live-in to avoid
-        // searching later when updating RP.
-        RematDefToLiveInRegions[Def].push_back(I);
+      // Region has VGPR spilling, we will try to reduce spilling as much as
+      // possible.
+      ExcessRP = NumVGPRs - MaxVGPRsNoSpill;
+      REMAT_DEBUG(dbgs() << "Region " << I << " is spilling VGPRs, save "
+                         << ExcessRP << " VGPR(s) to eliminate spilling\n");
+    } else if (IncreaseOccupancy) {
+      if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy) {
+        // Occupancy is SGPR-limited in the region, no point in trying to
+        // increase it through VGPR usage.
+        IncreaseOccupancy = false;
+        OptRegions.clear();
+      } else if (NumVGPRs > MaxVGPRsIncOcc) {
+        // Occupancy is VGPR-limited.
+        ExcessRP = NumVGPRs - MaxVGPRsIncOcc;
+        REMAT_DEBUG(dbgs() << "Region " << I << " has min. occupancy: save "
+                           << ExcessRP << " VGPR(s) to improve occupancy\n");
       }
     }
-    if (!AddedToRematList)
-      RematDefToLiveInRegions.erase(Def);
+    if (ExcessRP)
+      OptRegions.insert({I, ExcessRP});
   }
-}
+  if (OptRegions.empty())
+    return false;
 
-bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
-                                              const TargetInstrInfo *TII) {
-  // Temporary copies of cached variables we will be modifying and replacing if
-  // sinking succeeds.
-  SmallVector<
-      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
-      NewRegions;
-  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
-  DenseMap<unsigned, GCNRegPressure> NewPressure;
-  BitVector NewRescheduleRegions;
-  LiveIntervals *LIS = DAG.LIS;
+  // When we are reducing spilling, the target is the minimum achievable
+  // occupancy implied by workgroup sizes.
+  TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : OccBounds.first;
+
+  // Accounts for a reduction in RP in an optimizable region. Returns whether we
+  // estimate that we have identified enough rematerialization opportunities to
+  // achieve our goal.
+  auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask) -> bool {
+    auto NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
+    unsigned I = OptIt->getFirst();
+    unsigned &Excess = OptIt->getSecond();
+    if (NumRegs >= Excess)
+      OptRegions.erase(I);
+    else
+      Excess -= NumRegs;
+    return OptRegions.empty();
+  };
+
+  // We need up-to-date live-out info. to query live-out register masks in
+  // regions containing rematerializable instructions.
+  DAG.RegionLiveOuts.buildLiveRegMap();
+
+  // Cache set of registers that are going to be rematerialized.
+  DenseSet<unsigned> RematRegs;
+
+  // identify rematerializable instructions in the function.
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    auto Region = DAG.Regions[I];
+    for (auto MI = Region.first; MI != Region.second; ++MI) {
+      // The instruction must be trivially rematerializable.
+      MachineInstr &DefMI = *MI;
+      if (!isTriviallyReMaterializable(DefMI))
+        continue;
 
-  NewRegions.resize(DAG.Regions.size());
-  NewRescheduleRegions.resize(DAG.Regions.size());
+      // We only support rematerializing virtual VGPRs with one definition.
+      Register Reg = DefMI.getOperand(0).getReg();
+      if (!Reg.isVirtual() || !DAG.LIS->hasInterval(Reg) ||
+          !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
+          !DAG.MRI.hasOneDef(Reg))
+        continue;
 
-  // Collect only regions that has a rematerializable def as a live-in.
-  SmallSet<unsigned, 16> ImpactedRegions;
-  for (const auto &It : RematDefToLiveInRegions)
-    ImpactedRegions.insert(It.second.begin(), It.second.end());
+      // We only care to rematerialize the instruction if it has a single
+      // non-debug user in a different block.
+      MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
+      if (!UseMI || DefMI.getParent() == UseMI->getParent())
+        continue;
 
-  // Make copies of register pressure and live-ins cache that will be updated
-  // as we rematerialize.
-  for (auto Idx : ImpactedRegions) {
-    NewPressure[Idx] = DAG.Pressure[Idx];
-    NewLiveIns[Idx] = DAG.LiveIns[Idx];
-  }
-  NewRegions = DAG.Regions;
-  NewRescheduleRegions.reset();
+      // Do not rematerialize an instruction if it uses or is used by an
+      // instruction that we have designated for rematerialization.
+      // FIXME: Allow for rematerialization chains: this requires 1. updating
+      // remat points to account for uses that are rematerialized, and 2. either
+      // rematerializing the candidates in careful ordering, or deferring the
+      // MBB RP walk until the entire chain has been rematerialized.
+      if (Rematerializations.contains(UseMI) ||
+          llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
+            return MO.isReg() && RematRegs.contains(MO.getReg());
+          }))
+        continue;
 
-  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
-  bool Improved = false;
-  for (auto I : ImpactedRegions) {
-    if (!DAG.RegionsWithMinOcc[I])
-      continue;
+      // Do not rematerialize an instruction it it uses registers that aren't
+      // available at its use. This ensures that we are not extending any live
+      // range while rematerializing.
+      SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
+      SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
+      if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
+        continue;
 
-    Improved = false;
-    int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
-    int SGPRUsage = NewPressure[I].getSGPRNum();
+      REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
+      RematInstruction &Remat =
+          Rematerializations.try_emplace(&DefMI, I, UseMI).first->second;
+
+      bool RematUseful = false;
+      if (auto It = OptRegions.find(I); It != OptRegions.end()) {
+        // Optimistically consider that moving the instruction out of its
+        // defining region will reduce RP in the latter; this assumes that
+        // maximum RP in the region is reached somewhere between the defining
+        // instruction and the end of the region.
+        REMAT_DEBUG(dbgs() << "  Defining region is optimizable\n");
+        RematUseful = true;
+        LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
+        if (ReduceRPInRegion(It, Mask))
+          return true;
+      }
 
-    // TODO: Handle occupancy drop due to AGPR and SGPR.
-    // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
-    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
-      break;
+      for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
+        // We are only collecting regions in which the register is a live-in
+        // (and may be live-through).
+        auto It = DAG.LiveIns[LIRegion].find(Reg);
+        if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
+          continue;
+        Remat.LiveInRegions.insert(LIRegion);
+
+        // Account for the reduction in RP due to the rematerialization in an
+        // optimizable region in which the defined register is a live-in. This
+        // is exact for live-through region but optimistic in the using region,
+        // where RP is actually reduced only if maximum RP is reached somewhere
+        // between the beginning of the region and the rematerializable
+        // instruction's use.
+        if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
+          REMAT_DEBUG(dbgs() << "  Live-in in region " << LIRegion << '\n');
+          RematUseful = true;
+          if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg]))
+            return true;
+        }
+      }
 
-    // The occupancy of this region could have been improved by a previous
-    // iteration's sinking of defs.
-    if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
-      NewRescheduleRegions[I] = true;
-      Improved = true;
-      continue;
+      // If the instruction is not a live-in or live-out in any optimizable
+      // region then there is no point in rematerializing it.
+      if (!RematUseful) {
+        Rematerializations.pop_back();
+        REMAT_DEBUG(dbgs() << "  No impact, not rematerializing instruction\n");
+      } else {
+        RematRegs.insert(Reg);
+      }
     }
+  }
+
+  if (IncreaseOccupancy) {
+    // We were trying to increase occupancy but failed, abort the stage.
+    REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
+    Rematerializations.clear();
+    return false;
+  }
+  REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
+  return !Rematerializations.empty();
+}
+
+void PreRARematStage::rematerialize() {
+  const auto *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  // Collect regions whose RP changes in unpredictable way; we will have to
+  // fully recompute their RP after all rematerailizations.
+  DenseSet<unsigned> RecomputeRP;
+  SlotIndexes *Slots = DAG.LIS->getSlotIndexes();
+
+  // Rematerialize all instructions.
+  for (auto &[DefMI, Remat] : Rematerializations) {
+    MachineBasicBlock::iterator InsertPos(Remat.UseMI);
+    Register Reg = DefMI->getOperand(0).getReg();
+    unsigned SubReg = DefMI->getOperand(0).getSubReg();
+
+    // Rematerialize DefMI to its use block.
+    TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
+                       *DAG.TRI);
+    Remat.RematMI = &*std::prev(InsertPos);
+    Remat.RematMI->getOperand(0).setSubReg(SubReg);
+    DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
+
+    // Update region boundaries in regions we sinked from (remove defining MI)
+    // and to (insert MI rematerialized in use block). Only then we can erase
+    // the original MI.
+    DAG.updateRegionBoundaries(DAG.Regions, DefMI, nullptr);
+    DAG.updateRegionBoundaries(DAG.Regions, InsertPos, Remat.RematMI);
+    DefMI->eraseFromParent();
+    DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
+
+    // Collect all regions impacted by the rematerialization and update their
+    // live-in/RP information.
+    for (unsigned I : Remat.LiveInRegions) {
+      ImpactedRegions.insert({I, DAG.Pressure[I]});
+      GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
 
-    // First check if we have enough trivially rematerializable instructions to
-    // improve occupancy. Optimistically assume all instructions we are able to
-    // sink decreased RP.
-    int TotalSinkableRegs = 0;
-    for (const auto &It : RematerializableInsts[I]) {
-      MachineInstr *Def = It.first;
-      Register DefReg = Def->getOperand(0).getReg();
-      TotalSinkableRegs +=
-          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
 #ifdef EXPENSIVE_CHECKS
       // All uses are known to be available / live at the remat point. Thus, the
       // uses should already be live in to the region.
-      for (MachineOperand &MO : Def->operands()) {
+      for (MachineOperand &MO : DefMI->operands()) {
         if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
           continue;
 
@@ -1834,13 +1908,13 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
         if (!UseReg.isVirtual())
           continue;
 
-        LiveInterval &LI = LIS->getInterval(UseReg);
+        LiveInterval &LI = DAG.LIS->getInterval(UseReg);
         LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
         if (LI.hasSubRanges() && MO.getSubReg())
           LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
 
-        assert(NewLiveIns[I].contains(UseReg));
-        LaneBitmask LiveInMask = NewLiveIns[I][UseReg];
+        assert(RegionLiveIns.contains(UseReg));
+        LaneBitmask LiveInMask = RegionLiveIns[UseReg];
         LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
         // If this register has lanes not covered by the LiveIns, be sure they
         // do not map to any subrange. ref:
@@ -1852,130 +1926,104 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
         }
       }
 #endif
-    }
-    int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
-    unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
-    // If in the most optimistic scenario, we cannot improve occupancy, then do
-    // not attempt to sink any instructions.
-    if (OptimisticOccupancy <= DAG.MinOccupancy)
-      break;
 
-    unsigned ImproveOccupancy = 0;
-    SmallVector<MachineInstr *, 4> SinkedDefs;
-    for (auto &It : RematerializableInsts[I]) {
-      MachineInstr *Def = It.first;
-      MachineBasicBlock::iterator InsertPos =
-          MachineBasicBlock::iterator(It.second);
-      Register Reg = Def->getOperand(0).getReg();
-      // Rematerialize MI to its use block.
-      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
-                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
-      MachineInstr *NewMI = &*std::prev(InsertPos);
-      LIS->InsertMachineInstrInMaps(*NewMI);
-      LIS->removeInterval(Reg);
-      LIS->createAndComputeVirtRegInterval(Reg);
-      InsertedMIToOldDef[NewMI] = Def;
-
-      // Update region boundaries in scheduling region we sinked from since we
-      // may sink an instruction that was at the beginning or end of its region
-      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
-                                 /*Removing =*/true);
-
-      // Update region boundaries in region we sinked to.
-      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
-
-      LaneBitmask PrevMask = NewLiveIns[I][Reg];
-      // FIXME: Also update cached pressure for where the def was sinked from.
-      // Update RP for all regions that has this reg as a live-in and remove
-      // the reg from all regions as a live-in.
-      for (auto Idx : RematDefToLiveInRegions[Def]) {
-        NewLiveIns[Idx].erase(Reg);
-        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
-          // Def is live-through and not used in this block.
-          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
-        } else {
-          // Def is used and rematerialized into this block.
-          GCNDownwardRPTracker RPT(*LIS);
-          auto *NonDbgMI = &*skipDebugInstructionsForward(
-              NewRegions[Idx].first, NewRegions[Idx].second);
-          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
-          RPT.advance(NewRegions[Idx].second);
-          NewPressure[Idx] = RPT.moveMaxPressure();
-        }
-      }
-
-      SinkedDefs.push_back(Def);
-      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
-      if (ImproveOccupancy > DAG.MinOccupancy)
-        break;
+      // The register is no longer a live-in in all regions but the one that
+      // contains the single use. In live-through regions, maximum register
+      // pressure decreases predictably so we can directly update it. In the
+      // using region, maximum RP may or may not decrease, so we will mark it
+      // for re-computation after all materializations have taken place.
+      LaneBitmask PrevMask = RegionLiveIns[Reg];
+      RegionLiveIns.erase(Reg);
+      RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
+      if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
+        DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
+      else
+        RecomputeRP.insert(I);
     }
-
-    // Remove defs we just sinked from all regions' list of sinkable defs
-    for (auto &Def : SinkedDefs)
-      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
-        RematerializableInsts[TrackedIdx].erase(Def);
-
-    if (ImproveOccupancy <= DAG.MinOccupancy)
-      break;
-
-    NewRescheduleRegions[I] = true;
-    Improved = true;
-  }
-
-  if (!Improved) {
-    // Occupancy was not improved for all regions that were at MinOccupancy.
-    // Undo sinking and remove newly rematerialized instructions.
-    for (auto &Entry : InsertedMIToOldDef) {
-      MachineInstr *MI = Entry.first;
-      MachineInstr *OldMI = Entry.second;
-      Register Reg = MI->getOperand(0).getReg();
-      LIS->RemoveMachineInstrFromMaps(*MI);
-      MI->eraseFromParent();
-      OldMI->clearRegisterDeads(Reg);
-      LIS->removeInterval(Reg);
-      LIS->createAndComputeVirtRegInterval(Reg);
+    // RP in the region from which the instruction was rematerialized may or may
+    // not decrease.
+    ImpactedRegions.insert({Remat.DefRegion, DAG.Pressure[Remat.DefRegion]});
+    RecomputeRP.insert(Remat.DefRegion);
+
+    // Update the register's live interval manually. This is aligned with the
+    // instruction collection phase in that it assumes a single def and use
+    // (possibly subreg).
+    SlotIndex NewDef = Slots->getInstructionIndex(*Remat.RematMI).getRegSlot();
+    SlotIndex NewKill = Slots->getInstructionIndex(*Remat.UseMI).getRegSlot();
+
+    // Update the live range to reflect the register's rematerialization. We
+    // will have a single segment (we only remat regs with one use) and at most
+    // a single value number (we only remat virtual regs with a single def).
+    auto UpdateLiveRange = [&](LiveRange &LR) -> void {
+      assert(LR.segments.size() && "expected at least one segment");
+      assert(LR.valnos.size() < 2 && "expected at most one value number");
+
+      // Fix up the first segment and remove the others which have become
+      // unnecessary by construction.
+      LR.segments.truncate(1);
+      LiveRange::Segment &Seg = LR.segments.front();
+      Seg.start = NewDef;
+      Seg.end = NewKill;
+      LR.valnos.clear();
+      if (Seg.valno) {
+        // If present, update the segment's value number as well.
+        Seg.valno->def = NewDef;
+        LR.valnos.push_back(Seg.valno);
+      }
+    };
+
+    Register RematReg = Remat.RematMI->getOperand(0).getReg();
+    LiveInterval &RegLI = DAG.LIS->getInterval(RematReg);
+    UpdateLiveRange(RegLI);
+    if (RegLI.hasSubRanges()) {
+      LiveInterval::SubRange &SubRange = *RegLI.subrange_begin();
+      assert(!SubRange.Next && "expected at most one subrange");
+      UpdateLiveRange(SubRange);
     }
-    return false;
   }
 
-  // Occupancy was improved for all regions.
-  for (auto &Entry : InsertedMIToOldDef) {
-    MachineInstr *MI = Entry.first;
-    MachineInstr *OldMI = Entry.second;
-
-    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
-    DAG.BBLiveInMap.erase(OldMI);
-
-    // Remove OldMI and update LIS
-    Register Reg = MI->getOperand(0).getReg();
-    LIS->RemoveMachineInstrFromMaps(*OldMI);
-    OldMI->eraseFromParent();
-    LIS->removeInterval(Reg);
-    LIS->createAndComputeVirtRegInterval(Reg);
-  }
+  // All regions impacted by at least one rematerialization must be rescheduled.
+  // Maximum pressure must also be recomputed for all regions where it changed
+  // non-predictably and checked against the target occupancy.
+  AchievedOcc = TargetOcc;
+  for (auto &[I, OriginalRP] : ImpactedRegions) {
+    bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
+    DAG.RescheduleRegions[I] = !IsEmptyRegion;
+    if (!RecomputeRP.contains(I))
+      continue;
 
-  // Update live-ins, register pressure, and regions caches.
-  for (auto Idx : ImpactedRegions) {
-    DAG.LiveIns[Idx] = NewLiveIns[Idx];
-    DAG.Pressure[Idx] = NewPressure[Idx];
-    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
+    GCNRegPressure RP;
+    if (IsEmptyRegion) {
+      RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
+    } else {
+      GCNDownwardRPTracker RPT(*DAG.LIS);
+      auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
+                                                      DAG.Regions[I].second);
+      if (NonDbgMI == DAG.Regions[I].second) {
+        // Region is non-empty but contains only debug instructions.
+        RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
+      } else {
+        RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
+        RPT.advance(DAG.Regions[I].second);
+        RP = RPT.moveMaxPressure();
+      }
+    }
+    DAG.Pressure[I] = RP;
+    AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(ST));
   }
-  DAG.Regions = NewRegions;
-  DAG.RescheduleRegions = NewRescheduleRegions;
-
-  if (GCNTrackers)
-    DAG.RegionLiveOuts.buildLiveRegMap();
-
-  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
-
-  return true;
+  REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
 }
 
+// Copied from MachineLICM
 bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
   if (!DAG.TII->isTriviallyReMaterializable(MI))
     return false;
 
+  // Even though TargetInstrInfo::isReallyTriviallyReMaterializable already
+  // ensures that the instruction has no virtual register uses,
+  // SIInstrInfo::isReallyTriviallyReMaterializable may consider an instruction
+  // rematerializable and return before calling its parent's method, so we need
+  // to double-check here.
   for (const MachineOperand &MO : MI.all_uses()) {
     // We can't remat physreg uses, unless it is a constant or an ignorable
     // use (e.g. implicit exec use on VALU instructions)
@@ -1989,45 +2037,115 @@ bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
   return true;
 }
 
-// When removing, we will have to check both beginning and ending of the region.
-// When inserting, we will only have to check if we are inserting NewMI in front
-// of a scheduling region and do not need to check the ending since we will only
-// ever be inserting before an already existing MI.
+/// Identifies the parent MBB of a \p Region. For an empty region, this runs in
+/// linear time in the number of MBBs in \p MF; otherwise it runs in constant
+/// time.
+static MachineBasicBlock *getRegionMBB(MachineFunction &MF,
+                                       RegionBoundaries &Region) {
+  if (Region.first != Region.second)
+    return Region.first->getParent();
+  // The boundaries of an empty region may not point to a valid MI from which we
+  // can get the parent MBB, so we have to iterate over all the MF's MBBs.
+  for (MachineBasicBlock &MBB : MF)
+    if (MBB.end() == Region.second)
+      return &MBB;
+  llvm_unreachable("region's parent MBB cannot be identified");
+}
+
+void PreRARematStage::finalizeGCNSchedStage() {
+  // We consider that reducing spilling is always beneficial so we never
+  // rollback rematerializations in such cases. It's also possible that
+  // rescheduling lowers occupancy over the one achived just through remats, in
+  // which case we do not want to rollback either.
+  if (!IncreaseOccupancy || AchievedOcc == TargetOcc)
+    return;
+
+  REMAT_DEBUG(dbgs() << "Rollbacking all rematerializations\n");
+  const auto *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  // Rollback the rematerializations.
+  for (const auto &[_, Remat] : Rematerializations) {
+    MachineInstr &RematMI = *Remat.RematMI;
+    MachineBasicBlock::iterator InsertPos(DAG.Regions[Remat.DefRegion].second);
+    MachineBasicBlock *MBB = getRegionMBB(MF, DAG.Regions[Remat.DefRegion]);
+    Register Reg = RematMI.getOperand(0).getReg();
+    unsigned SubReg = RematMI.getOperand(0).getSubReg();
+
+    // Re-rematerialize MI at the end of its original region. Note that it may
+    // not be rematerialized exactly in the same position as originally within
+    // the region, but it should not matter much. Since we are only
+    // rematerializing instructions that do not have any virtual reg uses, we
+    // do not need to call LiveRangeEdit::allUsesAvailableAt() and
+    // LiveRangeEdit::canRematerializeAt().
+    TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
+    MachineInstr *NewMI = &*std::prev(InsertPos);
+    NewMI->getOperand(0).setSubReg(SubReg);
+    DAG.LIS->InsertMachineInstrInMaps(*NewMI);
+
+    // Erase rematerialized MI.
+    RematMI.eraseFromParent();
+    DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
+
+    // Recompute live interval for the re-rematerialized register
+    DAG.LIS->removeInterval(Reg);
+    DAG.LIS->createAndComputeVirtRegInterval(Reg);
+
+    // Re-add the register as a live-in in all regions it used to be one in.
+    for (unsigned LIRegion : Remat.LiveInRegions)
+      DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
+  }
+
+  // Reset RP in all impacted regions.
+  for (auto &[I, OriginalRP] : ImpactedRegions)
+    DAG.Pressure[I] = OriginalRP;
+
+  GCNSchedStage::finalizeGCNSchedStage();
+}
+
 void GCNScheduleDAGMILive::updateRegionBoundaries(
-    SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
-                              MachineBasicBlock::iterator>> &RegionBoundaries,
-    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
+    SmallVectorImpl<RegionBoundaries> &RegionBoundaries,
+    MachineBasicBlock::iterator MI, MachineInstr *NewMI) {
   unsigned I = 0, E = RegionBoundaries.size();
-  // Search for first region of the block where MI is located
-  while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
+  // Search for first region of the block where MI is located. We may encounter
+  // an empty region if all instructions from an initially non-empty region were
+  // removed.
+  while (I != E && RegionBoundaries[I].first != RegionBoundaries[I].second &&
+         MI->getParent() != RegionBoundaries[I].first->getParent())
     ++I;
 
   for (; I != E; ++I) {
-    if (MI->getParent() != RegionBoundaries[I].first->getParent())
+    auto &Bounds = RegionBoundaries[I];
+    assert(MI != Bounds.second && "cannot insert at region end");
+    assert(!NewMI || NewMI != Bounds.second && "cannot remove at region end");
+
+    // We may encounter an empty region if all of the region' instructions were
+    // previously removed.
+    if (Bounds.first == Bounds.second) {
+      if (MI->getParent()->end() != Bounds.second)
+        return;
+      continue;
+    }
+    if (MI->getParent() != Bounds.first->getParent())
       return;
 
-    if (Removing && MI == RegionBoundaries[I].first &&
-        MI == RegionBoundaries[I].second) {
-      // MI is in a region with size 1, after removing, the region will be
-      // size 0, set RegionBegin and RegionEnd to pass end of block iterator.
-      RegionBoundaries[I] =
-          std::pair(MI->getParent()->end(), MI->getParent()->end());
-      return;
-    }
-    if (MI == RegionBoundaries[I].first) {
-      if (Removing)
-        RegionBoundaries[I] =
-            std::pair(std::next(MI), RegionBoundaries[I].second);
+    // We only care for modifications at the beginning of the region since the
+    // upper region boundary is exclusive.
+    if (MI != Bounds.first)
+      continue;
+    if (!NewMI) {
+      // This is an MI removal, which may leave the region empty; in such cases
+      // set both boundaries to the removed instruction's MBB's end.
+      MachineBasicBlock::iterator NextMI = std::next(MI);
+      if (NextMI != Bounds.second)
+        Bounds.first = NextMI;
       else
-        // Inserted NewMI in front of region, set new RegionBegin to NewMI
-        RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
-                                        RegionBoundaries[I].second);
-      return;
-    }
-    if (Removing && MI == RegionBoundaries[I].second) {
-      RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
-      return;
+        Bounds.first = Bounds.second;
+    } else {
+      // This is an MI insertion at the beggining of the region.
+      Bounds.first = NewMI;
     }
+    return;
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 7d3e63df43da6..2d6440b1b0730 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -14,7 +14,9 @@
 #define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
 
 #include "GCNRegPressure.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 
 namespace llvm {
@@ -214,6 +216,11 @@ class RegionPressureMap {
   }
 };
 
+/// A region's boundaries i.e. a pair of instruction bundle iterators. The lower
+/// boundary is inclusive, the upper boundary is exclusive.
+using RegionBoundaries =
+    std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>;
+
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
@@ -234,8 +241,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   unsigned MinOccupancy;
 
   // Vector of regions recorder for later rescheduling
-  SmallVector<std::pair<MachineBasicBlock::iterator,
-                        MachineBasicBlock::iterator>, 32> Regions;
+  SmallVector<RegionBoundaries, 32> Regions;
 
   // Records if a region is not yet scheduled, or schedule has been reverted,
   // or we generally desire to reschedule it.
@@ -286,12 +292,14 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // Compute and cache live-ins and pressure for all regions in block.
   void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);
 
-  // Update region boundaries when removing MI or inserting NewMI before MI.
-  void updateRegionBoundaries(
-      SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
-                                MachineBasicBlock::iterator>> &RegionBoundaries,
-      MachineBasicBlock::iterator MI, MachineInstr *NewMI,
-      bool Removing = false);
+  /// If necessary, updates a region's boundaries following insertion ( \p NewMI
+  /// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region.
+  /// For an MI removal, this must be called before the MI is actually erased
+  /// from its parent MBB. If a region is left empty by a removal, both
+  /// boundaries are set to the last removed MI's MBB's end.
+  void
+  updateRegionBoundaries(SmallVectorImpl<RegionBoundaries> &RegionBoundaries,
+                         MachineBasicBlock::iterator MI, MachineInstr *NewMI);
 
   void runSchedStages();
 
@@ -431,30 +439,71 @@ class ClusteredLowOccStage : public GCNSchedStage {
       : GCNSchedStage(StageID, DAG) {}
 };
 
+/// Attempts to reduce function spilling or, if there is no spilling, to
+/// increase function occupancy by one with respect to ArchVGPR usage by sinking
+/// trivially rematerializable instructions to their use. When the stage
+/// estimates reducing spilling or increasing occupancy is possible, as few
+/// instructions as possible are rematerialized to reduce potential negative
+/// effects on function latency.
+///
+/// TODO: We should extend this to work on SGPRs and AGPRs as well.
 class PreRARematStage : public GCNSchedStage {
 private:
-  // Each region at MinOccupancy will have their own list of trivially
-  // rematerializable instructions we can remat to reduce RP. The list maps an
-  // instruction to the position we should remat before, usually the MI using
-  // the rematerializable instruction.
-  MapVector<unsigned, MapVector<MachineInstr *, MachineInstr *>>
-      RematerializableInsts;
-
-  // Map a trivially rematerializable def to a list of regions at MinOccupancy
-  // that has the defined reg as a live-in.
-  DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;
-
-  // Collect all trivially rematerializable VGPR instructions with a single def
-  // and single use outside the defining block into RematerializableInsts.
-  void collectRematerializableInstructions();
-
+  /// Useful information about a rematerializable instruction.
+  struct RematInstruction {
+    /// Single use of the rematerializable instruction's defined register,
+    /// located in a different block.
+    MachineInstr *UseMI;
+    /// Rematerialized version of \p DefMI, set in
+    /// PreRARematStage::rematerialize. Used for reverting rematerializations.
+    MachineInstr *RematMI;
+    /// Set of regions in which the rematerializable instruction's defined
+    /// register is a live-in.
+    SmallDenseSet<unsigned, 4> LiveInRegions;
+    /// Region containing the rematerializable instruction.
+    unsigned DefRegion;
+
+    RematInstruction(unsigned DefRegion, MachineInstr *UseMI)
+        : UseMI(UseMI), DefRegion(DefRegion) {}
+  };
+
+  /// Collects instructions to rematerialize.
+  MapVector<MachineInstr *, RematInstruction> Rematerializations;
+  /// Collect regions whose live-ins or register pressure will change due to
+  /// rematerializations.
+  DenseMap<unsigned, GCNRegPressure> ImpactedRegions;
+  /// In case we need to rollback rematerializations, save lane masks for all
+  /// rematerialized registers in all regions in which they are live-ins.
+  DenseMap<std::pair<unsigned, Register>, LaneBitmask> RegMasks;
+  /// Target occupancy the stage estimates is reachable through
+  /// rematerialization. Greater than or equal to the pre-stage min occupancy.
+  unsigned TargetOcc;
+  /// Achieved occupancy *only* through rematerializations (pre-rescheduling).
+  /// Smaller than or equal to the target occupancy.
+  unsigned AchievedOcc;
+  /// Whether the stage is attempting to increase occupancy in the abscence of
+  /// spilling.
+  bool IncreaseOccupancy;
+
+  /// Returns whether remat can reduce spilling or increase function occupancy
+  /// by 1 through rematerialization. If it can do one, collects instructions in
+  /// PreRARematStage::Rematerializations and sets the target occupancy in
+  /// PreRARematStage::TargetOccupancy.
+  bool canIncreaseOccupancyOrReduceSpill();
+
+  /// Whether the MI is trivially rematerializable and does not have any virtual
+  /// register use.
   bool isTriviallyReMaterializable(const MachineInstr &MI);
 
-  // TODO: Should also attempt to reduce RP of SGPRs and AGPRs
-  // Attempt to reduce RP of VGPR by sinking trivially rematerializable
-  // instructions. Returns true if we were able to sink instruction(s).
-  bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
-                               const TargetInstrInfo *TII);
+  /// Rematerializes all instructions in PreRARematStage::Rematerializations
+  /// and stores the achieved occupancy after remat in
+  /// PreRARematStage::AchievedOcc.
+  void rematerialize();
+
+  /// If remat alone did not increase occupancy to the target one, rollbacks all
+  /// rematerializations and resets live-ins/RP in all regions impacted by the
+  /// stage to their pre-stage values.
+  void finalizeGCNSchedStage() override;
 
   /// \p Returns true if all the uses in \p InstToRemat defined at \p
   /// OriginalIdx are live at \p RematIdx. This only checks liveness of virtual
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index b5e8e246825c7..0507b64c46192 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -496,44 +496,43 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
                             getReservedNumSGPRs(F));
 }
 
-unsigned GCNSubtarget::getBaseMaxNumVGPRs(
-    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
-  // Compute maximum number of VGPRs function can use using default/requested
-  // minimum number of waves per execution unit.
-  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
+unsigned
+GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
+                                 std::pair<unsigned, unsigned> NumVGPRBounds,
+                                 bool UnifiedRF) const {
+  const auto &[Min, Max] = NumVGPRBounds;
 
   // Check if maximum number of VGPRs was explicitly requested using
   // "amdgpu-num-vgpr" attribute.
-  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
-    unsigned Requested =
-        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
-
-    if (hasGFX90AInsts())
-      Requested *= 2;
-
-    // Make sure requested value is compatible with values implied by
-    // default/requested minimum/maximum number of waves per execution unit.
-    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
-      Requested = 0;
-    if (WavesPerEU.second && Requested &&
-        Requested < getMinNumVGPRs(WavesPerEU.second))
-      Requested = 0;
+  if (!F.hasFnAttribute("amdgpu-num-vgpr"))
+    return Max;
 
-    if (Requested)
-      MaxNumVGPRs = Requested;
-  }
+  unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
+  if (!Requested)
+    return Max;
+  if (UnifiedRF)
+    Requested *= 2;
 
-  return MaxNumVGPRs;
+  // Make sure requested value is inside the range of possible VGPR usage.
+  return std::clamp(Requested, Min, Max);
 }
 
 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
-  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
+  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
+  return getBaseMaxNumVGPRs(
+      F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)},
+      hasGFX90AInsts());
+}
+
+unsigned GCNSubtarget::getMaxNumArchVGPRs(const Function &F) const {
+  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
+  return getBaseMaxNumVGPRs(
+      F, {getMinNumArchVGPRs(Waves.second), getMaxNumArchVGPRs(Waves.first)},
+      false);
 }
 
 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
-  const Function &F = MF.getFunction();
-  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
+  return getMaxNumVGPRs(MF.getFunction());
 }
 
 void GCNSubtarget::adjustSchedDependency(
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 342b211199dca..29acb91afb82c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -38,12 +38,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // Following 2 enums are documented at:
   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
   enum class TrapHandlerAbi {
-    NONE   = 0x00,
+    NONE = 0x00,
     AMDHSA = 0x01,
   };
 
   enum class TrapID {
-    LLVMAMDHSATrap      = 0x02,
+    LLVMAMDHSATrap = 0x02,
     LLVMAMDHSADebugTrap = 0x03,
   };
 
@@ -269,24 +269,20 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
                const GCNTargetMachine &TM);
   ~GCNSubtarget() override;
 
-  GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
-                                                   StringRef GPU, StringRef FS);
+  GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU,
+                                                StringRef FS);
 
   /// Diagnose inconsistent subtarget features before attempting to codegen
   /// function \p F.
   void checkSubtargetFeatures(const Function &F) const;
 
-  const SIInstrInfo *getInstrInfo() const override {
-    return &InstrInfo;
-  }
+  const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }
 
   const SIFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
 
-  const SITargetLowering *getTargetLowering() const override {
-    return &TLInfo;
-  }
+  const SITargetLowering *getTargetLowering() const override { return &TLInfo; }
 
   const SIRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
@@ -324,9 +320,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
 
-  Generation getGeneration() const {
-    return (Generation)Gen;
-  }
+  Generation getGeneration() const { return (Generation)Gen; }
 
   unsigned getMaxWaveScratchSize() const {
     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
@@ -347,9 +341,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
   }
 
-  int getLDSBankCount() const {
-    return LDSBankCount;
-  }
+  int getLDSBankCount() const { return LDSBankCount; }
 
   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
@@ -364,29 +356,17 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool supportsWGP() const { return getGeneration() >= GFX10; }
 
-  bool hasIntClamp() const {
-    return HasIntClamp;
-  }
+  bool hasIntClamp() const { return HasIntClamp; }
 
-  bool hasFP64() const {
-    return FP64;
-  }
+  bool hasFP64() const { return FP64; }
 
-  bool hasMIMG_R128() const {
-    return MIMG_R128;
-  }
+  bool hasMIMG_R128() const { return MIMG_R128; }
 
-  bool hasHWFP64() const {
-    return FP64;
-  }
+  bool hasHWFP64() const { return FP64; }
 
-  bool hasHalfRate64Ops() const {
-    return HalfRate64Ops;
-  }
+  bool hasHalfRate64Ops() const { return HalfRate64Ops; }
 
-  bool hasFullRate64Ops() const {
-    return FullRate64Ops;
-  }
+  bool hasFullRate64Ops() const { return FullRate64Ops; }
 
   bool hasAddr64() const {
     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
@@ -402,65 +382,37 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() >= VOLCANIC_ISLANDS;
   }
 
-  bool hasFractBug() const {
-    return getGeneration() == SOUTHERN_ISLANDS;
-  }
+  bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
 
-  bool hasBFE() const {
-    return true;
-  }
+  bool hasBFE() const { return true; }
 
-  bool hasBFI() const {
-    return true;
-  }
+  bool hasBFI() const { return true; }
 
-  bool hasBFM() const {
-    return hasBFE();
-  }
+  bool hasBFM() const { return hasBFE(); }
 
-  bool hasBCNT(unsigned Size) const {
-    return true;
-  }
+  bool hasBCNT(unsigned Size) const { return true; }
 
-  bool hasFFBL() const {
-    return true;
-  }
+  bool hasFFBL() const { return true; }
 
-  bool hasFFBH() const {
-    return true;
-  }
+  bool hasFFBH() const { return true; }
 
-  bool hasMed3_16() const {
-    return getGeneration() >= AMDGPUSubtarget::GFX9;
-  }
+  bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
 
   bool hasMin3Max3_16() const {
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }
 
-  bool hasFmaMixInsts() const {
-    return HasFmaMixInsts;
-  }
+  bool hasFmaMixInsts() const { return HasFmaMixInsts; }
 
-  bool hasCARRY() const {
-    return true;
-  }
+  bool hasCARRY() const { return true; }
 
-  bool hasFMA() const {
-    return FMA;
-  }
+  bool hasFMA() const { return FMA; }
 
-  bool hasSwap() const {
-    return GFX9Insts;
-  }
+  bool hasSwap() const { return GFX9Insts; }
 
-  bool hasScalarPackInsts() const {
-    return GFX9Insts;
-  }
+  bool hasScalarPackInsts() const { return GFX9Insts; }
 
-  bool hasScalarMulHiInsts() const {
-    return GFX9Insts;
-  }
+  bool hasScalarMulHiInsts() const { return GFX9Insts; }
 
   bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
 
@@ -475,9 +427,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   /// True if the offset field of DS instructions works as expected. On SI, the
   /// offset uses a 16-bit adder and does not always wrap properly.
-  bool hasUsableDSOffset() const {
-    return getGeneration() >= SEA_ISLANDS;
-  }
+  bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
 
   bool unsafeDSOffsetFoldingEnabled() const {
     return EnableUnsafeDSOffsetFolding;
@@ -490,14 +440,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   /// Extra wait hazard is needed in some cases before
   /// s_cbranch_vccnz/s_cbranch_vccz.
-  bool hasReadVCCZBug() const {
-    return getGeneration() <= SEA_ISLANDS;
-  }
+  bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
 
   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
-  bool partialVCCWritesUpdateVCCZ() const {
-    return getGeneration() >= GFX10;
-  }
+  bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
 
   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
   /// was written by a VALU instruction.
@@ -511,18 +457,14 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() >= VOLCANIC_ISLANDS;
   }
 
-  bool hasRFEHazards() const {
-    return getGeneration() >= VOLCANIC_ISLANDS;
-  }
+  bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
 
   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
   unsigned getSetRegWaitStates() const {
     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
   }
 
-  bool dumpCode() const {
-    return DumpCode;
-  }
+  bool dumpCode() const { return DumpCode; }
 
   /// Return the amount of LDS that can be used that will not restrict the
   /// occupancy lower than WaveCount.
@@ -538,25 +480,17 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() >= AMDGPUSubtarget::GFX10;
   }
 
-  bool useFlatForGlobal() const {
-    return FlatForGlobal;
-  }
+  bool useFlatForGlobal() const { return FlatForGlobal; }
 
   /// \returns If target supports ds_read/write_b128 and user enables generation
   /// of ds_read/write_b128.
-  bool useDS128() const {
-    return CIInsts && EnableDS128;
-  }
+  bool useDS128() const { return CIInsts && EnableDS128; }
 
   /// \return If target supports ds_read/write_b96/128.
-  bool hasDS96AndDS128() const {
-    return CIInsts;
-  }
+  bool hasDS96AndDS128() const { return CIInsts; }
 
   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
-  bool haveRoundOpsF64() const {
-    return CIInsts;
-  }
+  bool haveRoundOpsF64() const { return CIInsts; }
 
   /// \returns If MUBUF instructions always perform range checking, even for
   /// buffer resources used for private memory access.
@@ -566,89 +500,55 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   /// \returns If target requires PRT Struct NULL support (zero result registers
   /// for sparse texture support).
-  bool usePRTStrictNull() const {
-    return EnablePRTStrictNull;
-  }
+  bool usePRTStrictNull() const { return EnablePRTStrictNull; }
 
-  bool hasAutoWaitcntBeforeBarrier() const {
-    return AutoWaitcntBeforeBarrier;
-  }
+  bool hasAutoWaitcntBeforeBarrier() const { return AutoWaitcntBeforeBarrier; }
 
   /// \returns true if the target supports backing off of s_barrier instructions
   /// when an exception is raised.
-  bool supportsBackOffBarrier() const {
-    return BackOffBarrier;
-  }
+  bool supportsBackOffBarrier() const { return BackOffBarrier; }
 
-  bool hasUnalignedBufferAccess() const {
-    return UnalignedBufferAccess;
-  }
+  bool hasUnalignedBufferAccess() const { return UnalignedBufferAccess; }
 
   bool hasUnalignedBufferAccessEnabled() const {
     return UnalignedBufferAccess && UnalignedAccessMode;
   }
 
-  bool hasUnalignedDSAccess() const {
-    return UnalignedDSAccess;
-  }
+  bool hasUnalignedDSAccess() const { return UnalignedDSAccess; }
 
   bool hasUnalignedDSAccessEnabled() const {
     return UnalignedDSAccess && UnalignedAccessMode;
   }
 
-  bool hasUnalignedScratchAccess() const {
-    return UnalignedScratchAccess;
-  }
+  bool hasUnalignedScratchAccess() const { return UnalignedScratchAccess; }
 
   bool hasUnalignedScratchAccessEnabled() const {
     return UnalignedScratchAccess && UnalignedAccessMode;
   }
 
-  bool hasUnalignedAccessMode() const {
-    return UnalignedAccessMode;
-  }
+  bool hasUnalignedAccessMode() const { return UnalignedAccessMode; }
 
-  bool hasApertureRegs() const {
-    return HasApertureRegs;
-  }
+  bool hasApertureRegs() const { return HasApertureRegs; }
 
-  bool isTrapHandlerEnabled() const {
-    return TrapHandler;
-  }
+  bool isTrapHandlerEnabled() const { return TrapHandler; }
 
-  bool isXNACKEnabled() const {
-    return TargetID.isXnackOnOrAny();
-  }
+  bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
 
-  bool isTgSplitEnabled() const {
-    return EnableTgSplit;
-  }
+  bool isTgSplitEnabled() const { return EnableTgSplit; }
 
-  bool isCuModeEnabled() const {
-    return EnableCuMode;
-  }
+  bool isCuModeEnabled() const { return EnableCuMode; }
 
   bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
 
-  bool hasFlatAddressSpace() const {
-    return FlatAddressSpace;
-  }
+  bool hasFlatAddressSpace() const { return FlatAddressSpace; }
 
-  bool hasFlatScrRegister() const {
-    return hasFlatAddressSpace();
-  }
+  bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
 
-  bool hasFlatInstOffsets() const {
-    return FlatInstOffsets;
-  }
+  bool hasFlatInstOffsets() const { return FlatInstOffsets; }
 
-  bool hasFlatGlobalInsts() const {
-    return FlatGlobalInsts;
-  }
+  bool hasFlatGlobalInsts() const { return FlatGlobalInsts; }
 
-  bool hasFlatScratchInsts() const {
-    return FlatScratchInsts;
-  }
+  bool hasFlatScratchInsts() const { return FlatScratchInsts; }
 
   // Check if target supports ST addressing mode with FLAT scratch instructions.
   // The ST addressing mode means no registers are used, either VGPR or SGPR,
@@ -659,30 +559,20 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
 
-  bool hasScalarFlatScratchInsts() const {
-    return ScalarFlatScratchInsts;
-  }
+  bool hasScalarFlatScratchInsts() const { return ScalarFlatScratchInsts; }
 
   bool enableFlatScratch() const {
     return flatScratchIsArchitected() ||
            (EnableFlatScratch && hasFlatScratchInsts());
   }
 
-  bool hasGlobalAddTidInsts() const {
-    return GFX10_BEncoding;
-  }
+  bool hasGlobalAddTidInsts() const { return GFX10_BEncoding; }
 
-  bool hasAtomicCSub() const {
-    return GFX10_BEncoding;
-  }
+  bool hasAtomicCSub() const { return GFX10_BEncoding; }
 
-  bool hasExportInsts() const {
-    return !hasGFX940Insts();
-  }
+  bool hasExportInsts() const { return !hasGFX940Insts(); }
 
-  bool hasVINTERPEncoding() const {
-    return GFX11Insts;
-  }
+  bool hasVINTERPEncoding() const { return GFX11Insts; }
 
   // DS_ADD_F64/DS_ADD_RTN_F64
   bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
@@ -691,162 +581,98 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() >= GFX9;
   }
 
-  bool hasFlatSegmentOffsetBug() const {
-    return HasFlatSegmentOffsetBug;
-  }
+  bool hasFlatSegmentOffsetBug() const { return HasFlatSegmentOffsetBug; }
 
-  bool hasFlatLgkmVMemCountInOrder() const {
-    return getGeneration() > GFX9;
-  }
+  bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
 
-  bool hasD16LoadStore() const {
-    return getGeneration() >= GFX9;
-  }
+  bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
 
   bool d16PreservesUnusedBits() const {
     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
   }
 
-  bool hasD16Images() const {
-    return getGeneration() >= VOLCANIC_ISLANDS;
-  }
+  bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
 
   /// Return if most LDS instructions have an m0 use that require m0 to be
   /// initialized.
-  bool ldsRequiresM0Init() const {
-    return getGeneration() < GFX9;
-  }
+  bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
 
   // True if the hardware rewinds and replays GWS operations if a wave is
   // preempted.
   //
   // If this is false, a GWS operation requires testing if a nack set the
   // MEM_VIOL bit, and repeating if so.
-  bool hasGWSAutoReplay() const {
-    return getGeneration() >= GFX9;
-  }
+  bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
 
   /// \returns if target has ds_gws_sema_release_all instruction.
-  bool hasGWSSemaReleaseAll() const {
-    return CIInsts;
-  }
+  bool hasGWSSemaReleaseAll() const { return CIInsts; }
 
   /// \returns true if the target has integer add/sub instructions that do not
   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
   /// for saturation.
-  bool hasAddNoCarry() const {
-    return AddNoCarryInsts;
-  }
+  bool hasAddNoCarry() const { return AddNoCarryInsts; }
 
   bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
 
   bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
 
-  bool hasUnpackedD16VMem() const {
-    return HasUnpackedD16VMem;
-  }
+  bool hasUnpackedD16VMem() const { return HasUnpackedD16VMem; }
 
   // Covers VS/PS/CS graphics shaders
   bool isMesaGfxShader(const Function &F) const {
     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
   }
 
-  bool hasMad64_32() const {
-    return getGeneration() >= SEA_ISLANDS;
-  }
+  bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
 
-  bool hasSDWAOmod() const {
-    return HasSDWAOmod;
-  }
+  bool hasSDWAOmod() const { return HasSDWAOmod; }
 
-  bool hasSDWAScalar() const {
-    return HasSDWAScalar;
-  }
+  bool hasSDWAScalar() const { return HasSDWAScalar; }
 
-  bool hasSDWASdst() const {
-    return HasSDWASdst;
-  }
+  bool hasSDWASdst() const { return HasSDWASdst; }
 
-  bool hasSDWAMac() const {
-    return HasSDWAMac;
-  }
+  bool hasSDWAMac() const { return HasSDWAMac; }
 
-  bool hasSDWAOutModsVOPC() const {
-    return HasSDWAOutModsVOPC;
-  }
+  bool hasSDWAOutModsVOPC() const { return HasSDWAOutModsVOPC; }
 
-  bool hasDLInsts() const {
-    return HasDLInsts;
-  }
+  bool hasDLInsts() const { return HasDLInsts; }
 
   bool hasFmacF64Inst() const { return HasFmacF64Inst; }
 
-  bool hasDot1Insts() const {
-    return HasDot1Insts;
-  }
+  bool hasDot1Insts() const { return HasDot1Insts; }
 
-  bool hasDot2Insts() const {
-    return HasDot2Insts;
-  }
+  bool hasDot2Insts() const { return HasDot2Insts; }
 
-  bool hasDot3Insts() const {
-    return HasDot3Insts;
-  }
+  bool hasDot3Insts() const { return HasDot3Insts; }
 
-  bool hasDot4Insts() const {
-    return HasDot4Insts;
-  }
+  bool hasDot4Insts() const { return HasDot4Insts; }
 
-  bool hasDot5Insts() const {
-    return HasDot5Insts;
-  }
+  bool hasDot5Insts() const { return HasDot5Insts; }
 
-  bool hasDot6Insts() const {
-    return HasDot6Insts;
-  }
+  bool hasDot6Insts() const { return HasDot6Insts; }
 
-  bool hasDot7Insts() const {
-    return HasDot7Insts;
-  }
+  bool hasDot7Insts() const { return HasDot7Insts; }
 
-  bool hasDot8Insts() const {
-    return HasDot8Insts;
-  }
+  bool hasDot8Insts() const { return HasDot8Insts; }
 
-  bool hasDot9Insts() const {
-    return HasDot9Insts;
-  }
+  bool hasDot9Insts() const { return HasDot9Insts; }
 
-  bool hasDot10Insts() const {
-    return HasDot10Insts;
-  }
+  bool hasDot10Insts() const { return HasDot10Insts; }
 
-  bool hasDot11Insts() const {
-    return HasDot11Insts;
-  }
+  bool hasDot11Insts() const { return HasDot11Insts; }
 
-  bool hasDot12Insts() const {
-    return HasDot12Insts;
-  }
+  bool hasDot12Insts() const { return HasDot12Insts; }
 
-  bool hasDot13Insts() const {
-    return HasDot13Insts;
-  }
+  bool hasDot13Insts() const { return HasDot13Insts; }
 
-  bool hasMAIInsts() const {
-    return HasMAIInsts;
-  }
+  bool hasMAIInsts() const { return HasMAIInsts; }
 
-  bool hasFP8Insts() const {
-    return HasFP8Insts;
-  }
+  bool hasFP8Insts() const { return HasFP8Insts; }
 
   bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
 
-  bool hasPkFmacF16Inst() const {
-    return HasPkFmacF16Inst;
-  }
+  bool hasPkFmacF16Inst() const { return HasPkFmacF16Inst; }
 
   bool hasAtomicFMinFMaxF32GlobalInsts() const {
     return HasAtomicFMinFMaxF32GlobalInsts;
@@ -919,37 +745,23 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return HasDefaultComponentBroadcast;
   }
 
-  bool hasNoSdstCMPX() const {
-    return HasNoSdstCMPX;
-  }
+  bool hasNoSdstCMPX() const { return HasNoSdstCMPX; }
 
-  bool hasVscnt() const {
-    return HasVscnt;
-  }
+  bool hasVscnt() const { return HasVscnt; }
 
-  bool hasGetWaveIdInst() const {
-    return HasGetWaveIdInst;
-  }
+  bool hasGetWaveIdInst() const { return HasGetWaveIdInst; }
 
-  bool hasSMemTimeInst() const {
-    return HasSMemTimeInst;
-  }
+  bool hasSMemTimeInst() const { return HasSMemTimeInst; }
 
-  bool hasShaderCyclesRegister() const {
-    return HasShaderCyclesRegister;
-  }
+  bool hasShaderCyclesRegister() const { return HasShaderCyclesRegister; }
 
   bool hasShaderCyclesHiLoRegisters() const {
     return HasShaderCyclesHiLoRegisters;
   }
 
-  bool hasVOP3Literal() const {
-    return HasVOP3Literal;
-  }
+  bool hasVOP3Literal() const { return HasVOP3Literal; }
 
-  bool hasNoDataDepHazard() const {
-    return HasNoDataDepHazard;
-  }
+  bool hasNoDataDepHazard() const { return HasNoDataDepHazard; }
 
   bool vmemWriteNeedsExpWaitcnt() const {
     return getGeneration() < SEA_ISLANDS;
@@ -974,15 +786,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // dynamic realignment in common cases.
   Align getStackAlignment() const { return Align(16); }
 
-  bool enableMachineScheduler() const override {
-    return true;
-  }
+  bool enableMachineScheduler() const override { return true; }
 
   bool useAA() const override;
 
-  bool enableSubRegLiveness() const override {
-    return true;
-  }
+  bool enableSubRegLiveness() const override { return true; }
 
   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
@@ -991,9 +799,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
 
   // XXX - Why is this here if it isn't in the default pass set?
-  bool enableEarlyIfConversion() const override {
-    return true;
-  }
+  bool enableEarlyIfConversion() const override { return true; }
 
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            unsigned NumRegionInstrs) const override;
@@ -1004,17 +810,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AMDGPU::getMaxNumUserSGPRs(*this);
   }
 
-  bool hasSMemRealTime() const {
-    return HasSMemRealTime;
-  }
+  bool hasSMemRealTime() const { return HasSMemRealTime; }
 
-  bool hasMovrel() const {
-    return HasMovrel;
-  }
+  bool hasMovrel() const { return HasMovrel; }
 
-  bool hasVGPRIndexMode() const {
-    return HasVGPRIndexMode;
-  }
+  bool hasVGPRIndexMode() const { return HasVGPRIndexMode; }
 
   bool useVGPRIndexMode() const;
 
@@ -1024,13 +824,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
 
-  bool hasScalarStores() const {
-    return HasScalarStores;
-  }
+  bool hasScalarStores() const { return HasScalarStores; }
 
-  bool hasScalarAtomics() const {
-    return HasScalarAtomics;
-  }
+  bool hasScalarAtomics() const { return HasScalarAtomics; }
 
   bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
   bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
@@ -1041,60 +837,40 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
   bool hasPermLane64() const { return getGeneration() >= GFX11; }
 
-  bool hasDPP() const {
-    return HasDPP;
-  }
+  bool hasDPP() const { return HasDPP; }
 
-  bool hasDPPBroadcasts() const {
-    return HasDPP && getGeneration() < GFX10;
-  }
+  bool hasDPPBroadcasts() const { return HasDPP && getGeneration() < GFX10; }
 
   bool hasDPPWavefrontShifts() const {
     return HasDPP && getGeneration() < GFX10;
   }
 
-  bool hasDPP8() const {
-    return HasDPP8;
-  }
+  bool hasDPP8() const { return HasDPP8; }
 
-  bool hasDPALU_DPP() const {
-    return HasDPALU_DPP;
-  }
+  bool hasDPALU_DPP() const { return HasDPALU_DPP; }
 
   bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
 
-  bool hasPackedFP32Ops() const {
-    return HasPackedFP32Ops;
-  }
+  bool hasPackedFP32Ops() const { return HasPackedFP32Ops; }
 
   // Has V_PK_MOV_B32 opcode
-  bool hasPkMovB32() const {
-    return GFX90AInsts;
-  }
+  bool hasPkMovB32() const { return GFX90AInsts; }
 
   bool hasFmaakFmamkF32Insts() const {
     return getGeneration() >= GFX10 || hasGFX940Insts();
   }
 
-  bool hasImageInsts() const {
-    return HasImageInsts;
-  }
+  bool hasImageInsts() const { return HasImageInsts; }
 
-  bool hasExtendedImageInsts() const {
-    return HasExtendedImageInsts;
-  }
+  bool hasExtendedImageInsts() const { return HasExtendedImageInsts; }
 
-  bool hasR128A16() const {
-    return HasR128A16;
-  }
+  bool hasR128A16() const { return HasR128A16; }
 
   bool hasA16() const { return HasA16; }
 
   bool hasG16() const { return HasG16; }
 
-  bool hasOffset3fBug() const {
-    return HasOffset3fBug;
-  }
+  bool hasOffset3fBug() const { return HasOffset3fBug; }
 
   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
 
@@ -1116,17 +892,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AMDGPU::getNSAMaxSize(*this, HasSampler);
   }
 
-  bool hasGFX10_AEncoding() const {
-    return GFX10_AEncoding;
-  }
+  bool hasGFX10_AEncoding() const { return GFX10_AEncoding; }
 
-  bool hasGFX10_BEncoding() const {
-    return GFX10_BEncoding;
-  }
+  bool hasGFX10_BEncoding() const { return GFX10_BEncoding; }
 
-  bool hasGFX10_3Insts() const {
-    return GFX10_3Insts;
-  }
+  bool hasGFX10_3Insts() const { return GFX10_3Insts; }
 
   bool hasMadF16() const;
 
@@ -1134,21 +904,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasLshlAddB64() const { return GFX940Insts; }
 
-  bool enableSIScheduler() const {
-    return EnableSIScheduler;
-  }
+  bool enableSIScheduler() const { return EnableSIScheduler; }
 
-  bool loadStoreOptEnabled() const {
-    return EnableLoadStoreOpt;
-  }
+  bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }
 
-  bool hasSGPRInitBug() const {
-    return SGPRInitBug;
-  }
+  bool hasSGPRInitBug() const { return SGPRInitBug; }
 
-  bool hasUserSGPRInit16Bug() const {
-    return UserSGPRInit16Bug && isWave32();
-  }
+  bool hasUserSGPRInit16Bug() const { return UserSGPRInit16Bug && isWave32(); }
 
   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
 
@@ -1156,18 +918,14 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return NegativeUnalignedScratchOffsetBug;
   }
 
-  bool hasMFMAInlineLiteralBug() const {
-    return HasMFMAInlineLiteralBug;
-  }
+  bool hasMFMAInlineLiteralBug() const { return HasMFMAInlineLiteralBug; }
 
   bool has12DWordStoreHazard() const {
     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
   }
 
   // \returns true if the subtarget supports DWORDX3 load/store instructions.
-  bool hasDwordx3LoadStores() const {
-    return CIInsts;
-  }
+  bool hasDwordx3LoadStores() const { return CIInsts; }
 
   bool hasReadM0MovRelInterpHazard() const {
     return getGeneration() == AMDGPUSubtarget::GFX9;
@@ -1186,39 +944,23 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() == AMDGPUSubtarget::GFX9;
   }
 
-  bool hasVcmpxPermlaneHazard() const {
-    return HasVcmpxPermlaneHazard;
-  }
+  bool hasVcmpxPermlaneHazard() const { return HasVcmpxPermlaneHazard; }
 
-  bool hasVMEMtoScalarWriteHazard() const {
-    return HasVMEMtoScalarWriteHazard;
-  }
+  bool hasVMEMtoScalarWriteHazard() const { return HasVMEMtoScalarWriteHazard; }
 
-  bool hasSMEMtoVectorWriteHazard() const {
-    return HasSMEMtoVectorWriteHazard;
-  }
+  bool hasSMEMtoVectorWriteHazard() const { return HasSMEMtoVectorWriteHazard; }
 
-  bool hasLDSMisalignedBug() const {
-    return LDSMisalignedBug && !EnableCuMode;
-  }
+  bool hasLDSMisalignedBug() const { return LDSMisalignedBug && !EnableCuMode; }
 
-  bool hasInstFwdPrefetchBug() const {
-    return HasInstFwdPrefetchBug;
-  }
+  bool hasInstFwdPrefetchBug() const { return HasInstFwdPrefetchBug; }
 
-  bool hasVcmpxExecWARHazard() const {
-    return HasVcmpxExecWARHazard;
-  }
+  bool hasVcmpxExecWARHazard() const { return HasVcmpxExecWARHazard; }
 
-  bool hasLdsBranchVmemWARHazard() const {
-    return HasLdsBranchVmemWARHazard;
-  }
+  bool hasLdsBranchVmemWARHazard() const { return HasLdsBranchVmemWARHazard; }
 
   // Shift amount of a 64 bit shift cannot be a highest allocated register
   // if also at the end of the allocation block.
-  bool hasShift64HighRegBug() const {
-    return GFX90AInsts && !GFX940Insts;
-  }
+  bool hasShift64HighRegBug() const { return GFX90AInsts && !GFX940Insts; }
 
   // Has one cycle hazard on transcendental instruction feeding a
   // non transcendental VALU.
@@ -1232,13 +974,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
 
   // Does not have HW interlocs for VALU writing and then reading SGPRs.
-  bool hasVDecCoExecHazard() const {
-    return GFX940Insts;
-  }
+  bool hasVDecCoExecHazard() const { return GFX940Insts; }
 
-  bool hasNSAtoVMEMBug() const {
-    return HasNSAtoVMEMBug;
-  }
+  bool hasNSAtoVMEMBug() const { return HasNSAtoVMEMBug; }
 
   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
 
@@ -1308,9 +1046,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// Returns true if the target supports
   /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
   /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
-  bool hasLDSLoadB96_B128() const {
-    return hasGFX950Insts();
-  }
+  bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
 
   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
 
@@ -1341,17 +1077,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool hasPermlane32Swap() const { return HasPermlane32Swap; }
   bool hasAshrPkInsts() const { return HasAshrPkInsts; }
 
-  bool hasMinimum3Maximum3F32() const {
-    return HasMinimum3Maximum3F32;
-  }
+  bool hasMinimum3Maximum3F32() const { return HasMinimum3Maximum3F32; }
 
-  bool hasMinimum3Maximum3F16() const {
-    return HasMinimum3Maximum3F16;
-  }
+  bool hasMinimum3Maximum3F16() const { return HasMinimum3Maximum3F16; }
 
-  bool hasMinimum3Maximum3PKF16() const {
-    return HasMinimum3Maximum3PKF16;
-  }
+  bool hasMinimum3Maximum3PKF16() const { return HasMinimum3Maximum3PKF16; }
 
   /// \returns The maximum number of instructions that can be enclosed in an
   /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
@@ -1368,13 +1098,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// VGPRs
   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
 
-  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
-  /// be achieved when the only function running on a CU is \p F, each workgroup
-  /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
-  /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
-  /// range, so this returns a range as well.
-  ///
-  /// Note that occupancy can be affected by the scratch allocation as well, but
+  /// Return occupancy for the given function. Used LDS and a number of
+  /// registers if provided.
+  /// Note, occupancy can be affected by the scratch allocation as well, but
   /// we do not have enough information to compute it.
   std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
                                                  unsigned LDSSize = 0,
@@ -1402,9 +1128,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   /// \returns true if the machine has merged shaders in which s0-s7 are
   /// reserved by the hardware and user SGPRs start at s8
-  bool hasMergedShaders() const {
-    return getGeneration() >= GFX9;
-  }
+  bool hasMergedShaders() const { return getGeneration() >= GFX9; }
 
   // \returns true if the target supports the pre-NGG legacy geometry path.
   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
@@ -1547,16 +1271,30 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
   }
 
+  /// \returns the minimum number of ArchVGPRs that will prevent achieving more
+  /// than the specified number of waves \p WavesPerEU.
+  unsigned getMinNumArchVGPRs(unsigned WavesPerEU) const {
+    return AMDGPU::IsaInfo::getMinNumArchVGPRs(this, WavesPerEU);
+  }
+
   /// \returns the maximum number of VGPRs that can be used and still achieved
   /// at least the specified number of waves \p WavesPerEU.
   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
   }
 
-  /// \returns max num VGPRs. This is the common utility function
-  /// called by MachineFunction and Function variants of getMaxNumVGPRs.
+  /// \returns the maximum number of ArchVGPRs that can be used and still
+  /// achieve at least the specified number of waves \p WavesPerEU.
+  unsigned getMaxNumArchVGPRs(unsigned WavesPerEU) const {
+    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
+  }
+
+  /// \returns max num VGPRs. This is the common utility function called by
+  /// MachineFunction and Function variants of getMaxNum[Arch]VGPRs.
   unsigned getBaseMaxNumVGPRs(const Function &F,
-                              std::pair<unsigned, unsigned> WavesPerEU) const;
+                              std::pair<unsigned, unsigned> NumVGPRBounds,
+                              bool UnifiedRF) const;
+
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p F, or number of VGPRs explicitly
   /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
@@ -1567,9 +1305,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// unit requirement.
   unsigned getMaxNumVGPRs(const Function &F) const;
 
-  unsigned getMaxNumAGPRs(const Function &F) const {
-    return getMaxNumVGPRs(F);
-  }
+  /// Returns the maximum number of ArchVGPRs that meets number of waves per
+  /// execution unit requirement for function \p F, or number of ArchVGPRs
+  /// explicitly requested using "amdgpu-num-vgpr" attribute attached to
+  /// function \p F.
+  unsigned getMaxNumArchVGPRs(const Function &F) const;
+
+  unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
 
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p MF, or number of VGPRs explicitly
@@ -1581,13 +1323,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// unit requirement.
   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
 
-  bool isWave32() const {
-    return getWavefrontSize() == 32;
-  }
+  bool isWave32() const { return getWavefrontSize() == 32; }
 
-  bool isWave64() const {
-    return getWavefrontSize() == 64;
-  }
+  bool isWave64() const { return getWavefrontSize() == 64; }
 
   /// Returns if the wavesize of this subtarget is known reliable. This is false
   /// only for the a default target-cpu that does not have an explicit
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 59afcbed35294..5f229b310f686 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1240,28 +1240,36 @@ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
   return 5;
 }
 
-unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
+unsigned getBaseMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
+                            unsigned TotalNumVGPRs,
+                            unsigned NumAddressableVGPRs) {
   assert(WavesPerEU != 0);
-
   unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
   if (WavesPerEU >= MaxWavesPerEU)
     return 0;
+  unsigned MinWavesPerEU =
+      getNumWavesPerEUWithNumVGPRs(STI, NumAddressableVGPRs);
+  if (WavesPerEU < MinWavesPerEU)
+    return getMinNumVGPRs(STI, MinWavesPerEU);
 
-  unsigned TotNumVGPRs = getTotalNumVGPRs(STI);
-  unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI);
   unsigned Granule = getVGPRAllocGranule(STI);
-  unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule);
-
-  if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule))
+  unsigned MaxNumVGPRs = alignDown(TotalNumVGPRs / WavesPerEU, Granule);
+  if (MaxNumVGPRs == alignDown(TotalNumVGPRs / MaxWavesPerEU, Granule))
     return 0;
+  unsigned MaxNumVGPRsNext =
+      alignDown(TotalNumVGPRs / (WavesPerEU + 1), Granule);
+  unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
+  return std::min(MinNumVGPRs, NumAddressableVGPRs);
+}
 
-  unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs);
-  if (WavesPerEU < MinWavesPerEU)
-    return getMinNumVGPRs(STI, MinWavesPerEU);
+unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
+  return getBaseMinNumVGPRs(STI, WavesPerEU, getTotalNumVGPRs(STI),
+                            getAddressableNumVGPRs(STI));
+}
 
-  unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule);
-  unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
-  return std::min(MinNumVGPRs, AddrsableNumVGPRs);
+unsigned getMinNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
+  unsigned TotNumArchVGPRs = getAddressableNumArchVGPRs(STI);
+  return getBaseMinNumVGPRs(STI, WavesPerEU, TotNumArchVGPRs, TotNumArchVGPRs);
 }
 
 unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
@@ -1273,6 +1281,12 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   return std::min(MaxNumVGPRs, AddressableNumVGPRs);
 }
 
+unsigned getMaxNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
+  assert(WavesPerEU != 0);
+  return alignDown(getAddressableNumArchVGPRs(STI) / WavesPerEU,
+                   getVGPRAllocGranule(STI));
+}
+
 unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
                                  std::optional<bool> EnableWavefrontSize32) {
   return getGranulatedNumRegisterBlocks(
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 67bebfb3418d5..ebcbd0c07506b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -318,14 +318,29 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI);
 /// \returns Addressable number of VGPRs for given subtarget \p STI.
 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI);
 
+/// Returns the minimum number of VGPRs that meets given number of waves per
+/// execution unit requirement for given subtarget \p STI with \p TotNumVGPRs
+/// VGPRs available and \p NumAddressableVGPRs addressable VGPRs.
+unsigned getBaseMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
+                            unsigned TotalNumVGPRs,
+                            unsigned NumAddressableVGPRs);
+
 /// \returns Minimum number of VGPRs that meets given number of waves per
 /// execution unit requirement for given subtarget \p STI.
 unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
+/// \returns Minimum number of ArchVGPRs that meets given number of waves per
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMinNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
+
 /// \returns Maximum number of VGPRs that meets given number of waves per
 /// execution unit requirement for given subtarget \p STI.
 unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
+/// \returns Maximum number of ArchVGPRs that meets given number of waves per
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMaxNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
+
 /// \returns Number of waves reachable for a given \p NumVGPRs usage for given
 /// subtarget \p STI.
 unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
new file mode 100644
index 0000000000000..3c15201cd698e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
@@ -0,0 +1,221 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
+
+--- |
+  ; User-requested maximum number of VGPRs need to be taken into account by
+  ; the scheduler's rematerialization stage. Register usage above that number
+  ; is considered like spill; occupancy is "inadvertently" increased when
+  ; eliminating spill.
+  define void @small_num_vgprs_as_spill() "amdgpu-num-vgpr"="28" {
+    ret void
+  }
+
+  ; Min/Max occupancy is 8, the scheduler's rematerialization stage should not
+  ; try to rematerialize instructions.
+  define void @dont_remat_at_max_occ() "amdgpu-flat-work-group-size"="1024,1024" {
+    ret void
+  }
+---
+name:            small_num_vgprs_as_spill
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: small_num_vgprs_as_spill
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
+    %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+    %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+
+  bb.1:
+
+    S_NOP 0, implicit %0,   implicit %1,   implicit %2,   implicit %3,   implicit %4
+    S_NOP 0, implicit %5,   implicit %6,   implicit %7,   implicit %8,   implicit %9
+    S_NOP 0, implicit %10,  implicit %11,  implicit %12,  implicit %13,  implicit %14
+    S_NOP 0, implicit %15,  implicit %16,  implicit %17,  implicit %18,  implicit %19
+    S_NOP 0, implicit %20,  implicit %21,  implicit %22,  implicit %23,  implicit %24
+    S_NOP 0, implicit %25,  implicit %26,  implicit %27,  implicit %28,  implicit %29
+    S_NOP 0, implicit %30,  implicit %31,  implicit %32,  implicit %33
+
+    S_ENDPGM 0
+...
+---
+name:            dont_remat_at_max_occ
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: dont_remat_at_max_occ
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode,
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode,
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode,
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode,
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode,
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode,
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode,
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode,
+
+  bb.1:
+
+    S_NOP 0, implicit %0,   implicit %1,   implicit %2,   implicit %3,   implicit %4
+    S_NOP 0, implicit %5,   implicit %6,   implicit %7,   implicit %8,   implicit %9
+    S_NOP 0, implicit %10,  implicit %11,  implicit %12,  implicit %13,  implicit %14
+    S_NOP 0, implicit %15,  implicit %16,  implicit %17,  implicit %18,  implicit %19
+    S_NOP 0, implicit %20,  implicit %21,  implicit %22,  implicit %23,  implicit %24
+    S_NOP 0, implicit %25,  implicit %26,  implicit %27,  implicit %28,  implicit %29
+    S_NOP 0, implicit %30,  implicit %31
+
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir
index 5dc6d2ee8f695..371753801d1a3 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir
@@ -17,7 +17,7 @@ machineFunctionInfo:
   isEntryFunction: true
 body:             |
   ; DEBUG: Machine code for function sink_and_inc_idx_when_skipping_small_region_1: IsSSA, NoPHIs, TracksLiveness
-  ; DEBUG: Retrying function scheduling with improved occupancy of 10 from rematerializing
+  ; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 10 from rematerializing (original was 9, target was 10)
   ; DEBUG-NEXT: ********** MI Scheduling **********
   ; DEBUG-NEXT: sink_and_inc_idx_when_skipping_small_region_1:%bb.2
   ; DEBUG-NEXT:   From: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -89,7 +89,7 @@ machineFunctionInfo:
   isEntryFunction: true
 body:             |
   ; DEBUG: Machine code for function sink_and_inc_idx_when_skipping_small_regions_2: IsSSA, NoPHIs, TracksLiveness
-  ; DEBUG: Retrying function scheduling with improved occupancy of 10 from rematerializing
+  ; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 10 from rematerializing (original was 9, target was 10)
   ; DEBUG-NEXT: ********** MI Scheduling **********
   ; DEBUG-NEXT: sink_and_inc_idx_when_skipping_small_regions_2:%bb.2
   ; DEBUG-NEXT:   From: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index 9991cb1837e01..1170ec0b88124 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -725,11 +725,11 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -1168,18 +1168,18 @@ body:             |
   ; GFX908-NEXT:   [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
   ; GFX908-NEXT:   [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
+  ; GFX908-NEXT:   [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
+  ; GFX908-NEXT:   [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
+  ; GFX908-NEXT:   [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
+  ; GFX908-NEXT:   [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
+  ; GFX908-NEXT:   [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
   ; GFX908-NEXT:   [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
   ; GFX908-NEXT:   [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
   ; GFX908-NEXT:   [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 80
@@ -2956,11 +2956,11 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -3134,11 +3134,11 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -3324,11 +3324,11 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -4920,6 +4920,604 @@ body:             |
     S_ENDPGM 0
 ...
 ---
+name:            test_occ_1_sink_for_spill
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: test_occ_1_sink_for_spill
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_35:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 35, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_36:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 36, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_37:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 37, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_38:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 38, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_39:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 39, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_40:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 40, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_41:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 41, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_42:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 42, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_43:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 43, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_44:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 44, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_45:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 45, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_46:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 46, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_47:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 47, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_48:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 48, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_49:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 49, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_50:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 50, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_51:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 51, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_52:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 52, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_53:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 53, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_54:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 54, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_55:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 55, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_56:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 56, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_57:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 57, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_58:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 58, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_59:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 59, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_60:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 60, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_61:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 61, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_62:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 62, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_63:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 63, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_64:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_65:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 65, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_66:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 66, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_67:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 67, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_68:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 68, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_69:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 69, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_70:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 70, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_71:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 71, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_72:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 72, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_73:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 73, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_74:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 74, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_75:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 75, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_76:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 76, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_77:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 77, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_78:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 78, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_79:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 79, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_80:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 80, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_81:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 81, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_82:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 82, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_83:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 83, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_84:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 84, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_85:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 85, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_86:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 86, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_87:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 87, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_88:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 88, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_89:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 89, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_90:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 90, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_91:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 91, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_92:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 92, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_93:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 93, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_94:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 94, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_95:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 95, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_96:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 96, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_97:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 97, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_98:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 98, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_99:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 99, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_100:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 100, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_101:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 101, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_102:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 102, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_103:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 103, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_104:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 104, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_105:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 105, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_106:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 106, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_107:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 107, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_108:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 108, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_109:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 109, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_110:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 110, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_111:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 111, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_112:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 112, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_113:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 113, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_114:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 114, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_115:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 115, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_116:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 116, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_117:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 117, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_118:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 118, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_119:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 119, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_120:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 120, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_121:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 121, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_122:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 122, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_123:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 123, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_124:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 124, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_125:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 125, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_126:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 126, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_127:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 127, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_128:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 128, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_129:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 129, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_130:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 130, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_131:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 131, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_132:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 132, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_133:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 133, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_134:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 134, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_135:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 135, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_136:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 136, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_137:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 137, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_138:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 138, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_139:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 139, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_140:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 140, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_141:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 141, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_142:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 142, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_143:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 143, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_144:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 144, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_145:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 145, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_146:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 146, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_147:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 147, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_148:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 148, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_149:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 149, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_150:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 150, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_151:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 151, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_152:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 152, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_153:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 153, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_154:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 154, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_155:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 155, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_156:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 156, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_157:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 157, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_158:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 158, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_159:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 159, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_160:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 160, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_161:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 161, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_162:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 162, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_163:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 163, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_164:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 164, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_165:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 165, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_166:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 166, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_167:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 167, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_168:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 168, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_169:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 169, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_170:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 170, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_171:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 171, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_172:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 172, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_173:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 173, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_174:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 174, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_175:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 175, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_176:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 176, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_177:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 177, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_178:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 178, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_179:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 179, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_180:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 180, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_181:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 181, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_182:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 182, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_183:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 183, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_184:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 184, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_185:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 185, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_186:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 186, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_187:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 187, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_188:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 188, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_189:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 189, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_190:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 190, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_191:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 191, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_192:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 192, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_193:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 193, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_194:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 194, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_195:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 195, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_196:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 196, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_197:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 197, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_198:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 198, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_199:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 199, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_200:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 200, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_201:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 201, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_202:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 202, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_203:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 203, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_204:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 204, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_205:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 205, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_206:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 206, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_207:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 207, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_208:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 208, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_209:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 209, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_210:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 210, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_211:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 211, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_212:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 212, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_213:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 213, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_214:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 214, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_215:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 215, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_216:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 216, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_217:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 217, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_218:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 218, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_219:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 219, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_220:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 220, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_221:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 221, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_222:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 222, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_223:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 223, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_224:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 224, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_225:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 225, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_226:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 226, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_227:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 227, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_228:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 228, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_229:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 229, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_230:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 230, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_231:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 231, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_232:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 232, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_233:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 233, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_234:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 234, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_235:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 235, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_236:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 236, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_237:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 237, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_238:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 238, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_239:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 239, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_240:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 240, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_241:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 241, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_242:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 242, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_243:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 243, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_244:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 244, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_245:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 245, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_246:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 246, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_247:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 247, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_248:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 248, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_249:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 249, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_250:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 250, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_251:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 251, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_252:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 252, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_253:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 253, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_254:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 254, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.2:
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_34]], implicit [[V_CVT_I32_F64_e32_35]], implicit [[V_CVT_I32_F64_e32_36]], implicit [[V_CVT_I32_F64_e32_37]], implicit [[V_CVT_I32_F64_e32_38]], implicit [[V_CVT_I32_F64_e32_39]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_40]], implicit [[V_CVT_I32_F64_e32_41]], implicit [[V_CVT_I32_F64_e32_42]], implicit [[V_CVT_I32_F64_e32_43]], implicit [[V_CVT_I32_F64_e32_44]], implicit [[V_CVT_I32_F64_e32_45]], implicit [[V_CVT_I32_F64_e32_46]], implicit [[V_CVT_I32_F64_e32_47]], implicit [[V_CVT_I32_F64_e32_48]], implicit [[V_CVT_I32_F64_e32_49]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_50]], implicit [[V_CVT_I32_F64_e32_51]], implicit [[V_CVT_I32_F64_e32_52]], implicit [[V_CVT_I32_F64_e32_53]], implicit [[V_CVT_I32_F64_e32_54]], implicit [[V_CVT_I32_F64_e32_55]], implicit [[V_CVT_I32_F64_e32_56]], implicit [[V_CVT_I32_F64_e32_57]], implicit [[V_CVT_I32_F64_e32_58]], implicit [[V_CVT_I32_F64_e32_59]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_60]], implicit [[V_CVT_I32_F64_e32_61]], implicit [[V_CVT_I32_F64_e32_62]], implicit [[V_CVT_I32_F64_e32_63]], implicit [[V_CVT_I32_F64_e32_64]], implicit [[V_CVT_I32_F64_e32_65]], implicit [[V_CVT_I32_F64_e32_66]], implicit [[V_CVT_I32_F64_e32_67]], implicit [[V_CVT_I32_F64_e32_68]], implicit [[V_CVT_I32_F64_e32_69]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_70]], implicit [[V_CVT_I32_F64_e32_71]], implicit [[V_CVT_I32_F64_e32_72]], implicit [[V_CVT_I32_F64_e32_73]], implicit [[V_CVT_I32_F64_e32_74]], implicit [[V_CVT_I32_F64_e32_75]], implicit [[V_CVT_I32_F64_e32_76]], implicit [[V_CVT_I32_F64_e32_77]], implicit [[V_CVT_I32_F64_e32_78]], implicit [[V_CVT_I32_F64_e32_79]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_80]], implicit [[V_CVT_I32_F64_e32_81]], implicit [[V_CVT_I32_F64_e32_82]], implicit [[V_CVT_I32_F64_e32_83]], implicit [[V_CVT_I32_F64_e32_84]], implicit [[V_CVT_I32_F64_e32_85]], implicit [[V_CVT_I32_F64_e32_86]], implicit [[V_CVT_I32_F64_e32_87]], implicit [[V_CVT_I32_F64_e32_88]], implicit [[V_CVT_I32_F64_e32_89]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_90]], implicit [[V_CVT_I32_F64_e32_91]], implicit [[V_CVT_I32_F64_e32_92]], implicit [[V_CVT_I32_F64_e32_93]], implicit [[V_CVT_I32_F64_e32_94]], implicit [[V_CVT_I32_F64_e32_95]], implicit [[V_CVT_I32_F64_e32_96]], implicit [[V_CVT_I32_F64_e32_97]], implicit [[V_CVT_I32_F64_e32_98]], implicit [[V_CVT_I32_F64_e32_99]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_100]], implicit [[V_CVT_I32_F64_e32_101]], implicit [[V_CVT_I32_F64_e32_102]], implicit [[V_CVT_I32_F64_e32_103]], implicit [[V_CVT_I32_F64_e32_104]], implicit [[V_CVT_I32_F64_e32_105]], implicit [[V_CVT_I32_F64_e32_106]], implicit [[V_CVT_I32_F64_e32_107]], implicit [[V_CVT_I32_F64_e32_108]], implicit [[V_CVT_I32_F64_e32_109]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_110]], implicit [[V_CVT_I32_F64_e32_111]], implicit [[V_CVT_I32_F64_e32_112]], implicit [[V_CVT_I32_F64_e32_113]], implicit [[V_CVT_I32_F64_e32_114]], implicit [[V_CVT_I32_F64_e32_115]], implicit [[V_CVT_I32_F64_e32_116]], implicit [[V_CVT_I32_F64_e32_117]], implicit [[V_CVT_I32_F64_e32_118]], implicit [[V_CVT_I32_F64_e32_119]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_120]], implicit [[V_CVT_I32_F64_e32_121]], implicit [[V_CVT_I32_F64_e32_122]], implicit [[V_CVT_I32_F64_e32_123]], implicit [[V_CVT_I32_F64_e32_124]], implicit [[V_CVT_I32_F64_e32_125]], implicit [[V_CVT_I32_F64_e32_126]], implicit [[V_CVT_I32_F64_e32_127]], implicit [[V_CVT_I32_F64_e32_128]], implicit [[V_CVT_I32_F64_e32_129]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_130]], implicit [[V_CVT_I32_F64_e32_131]], implicit [[V_CVT_I32_F64_e32_132]], implicit [[V_CVT_I32_F64_e32_133]], implicit [[V_CVT_I32_F64_e32_134]], implicit [[V_CVT_I32_F64_e32_135]], implicit [[V_CVT_I32_F64_e32_136]], implicit [[V_CVT_I32_F64_e32_137]], implicit [[V_CVT_I32_F64_e32_138]], implicit [[V_CVT_I32_F64_e32_139]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_140]], implicit [[V_CVT_I32_F64_e32_141]], implicit [[V_CVT_I32_F64_e32_142]], implicit [[V_CVT_I32_F64_e32_143]], implicit [[V_CVT_I32_F64_e32_144]], implicit [[V_CVT_I32_F64_e32_145]], implicit [[V_CVT_I32_F64_e32_146]], implicit [[V_CVT_I32_F64_e32_147]], implicit [[V_CVT_I32_F64_e32_148]], implicit [[V_CVT_I32_F64_e32_149]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_150]], implicit [[V_CVT_I32_F64_e32_151]], implicit [[V_CVT_I32_F64_e32_152]], implicit [[V_CVT_I32_F64_e32_153]], implicit [[V_CVT_I32_F64_e32_154]], implicit [[V_CVT_I32_F64_e32_155]], implicit [[V_CVT_I32_F64_e32_156]], implicit [[V_CVT_I32_F64_e32_157]], implicit [[V_CVT_I32_F64_e32_158]], implicit [[V_CVT_I32_F64_e32_159]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_160]], implicit [[V_CVT_I32_F64_e32_161]], implicit [[V_CVT_I32_F64_e32_162]], implicit [[V_CVT_I32_F64_e32_163]], implicit [[V_CVT_I32_F64_e32_164]], implicit [[V_CVT_I32_F64_e32_165]], implicit [[V_CVT_I32_F64_e32_166]], implicit [[V_CVT_I32_F64_e32_167]], implicit [[V_CVT_I32_F64_e32_168]], implicit [[V_CVT_I32_F64_e32_169]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_170]], implicit [[V_CVT_I32_F64_e32_171]], implicit [[V_CVT_I32_F64_e32_172]], implicit [[V_CVT_I32_F64_e32_173]], implicit [[V_CVT_I32_F64_e32_174]], implicit [[V_CVT_I32_F64_e32_175]], implicit [[V_CVT_I32_F64_e32_176]], implicit [[V_CVT_I32_F64_e32_177]], implicit [[V_CVT_I32_F64_e32_178]], implicit [[V_CVT_I32_F64_e32_179]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_180]], implicit [[V_CVT_I32_F64_e32_181]], implicit [[V_CVT_I32_F64_e32_182]], implicit [[V_CVT_I32_F64_e32_183]], implicit [[V_CVT_I32_F64_e32_184]], implicit [[V_CVT_I32_F64_e32_185]], implicit [[V_CVT_I32_F64_e32_186]], implicit [[V_CVT_I32_F64_e32_187]], implicit [[V_CVT_I32_F64_e32_188]], implicit [[V_CVT_I32_F64_e32_189]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_190]], implicit [[V_CVT_I32_F64_e32_191]], implicit [[V_CVT_I32_F64_e32_192]], implicit [[V_CVT_I32_F64_e32_193]], implicit [[V_CVT_I32_F64_e32_194]], implicit [[V_CVT_I32_F64_e32_195]], implicit [[V_CVT_I32_F64_e32_196]], implicit [[V_CVT_I32_F64_e32_197]], implicit [[V_CVT_I32_F64_e32_198]], implicit [[V_CVT_I32_F64_e32_199]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_200]], implicit [[V_CVT_I32_F64_e32_201]], implicit [[V_CVT_I32_F64_e32_202]], implicit [[V_CVT_I32_F64_e32_203]], implicit [[V_CVT_I32_F64_e32_204]], implicit [[V_CVT_I32_F64_e32_205]], implicit [[V_CVT_I32_F64_e32_206]], implicit [[V_CVT_I32_F64_e32_207]], implicit [[V_CVT_I32_F64_e32_208]], implicit [[V_CVT_I32_F64_e32_209]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_210]], implicit [[V_CVT_I32_F64_e32_211]], implicit [[V_CVT_I32_F64_e32_212]], implicit [[V_CVT_I32_F64_e32_213]], implicit [[V_CVT_I32_F64_e32_214]], implicit [[V_CVT_I32_F64_e32_215]], implicit [[V_CVT_I32_F64_e32_216]], implicit [[V_CVT_I32_F64_e32_217]], implicit [[V_CVT_I32_F64_e32_218]], implicit [[V_CVT_I32_F64_e32_219]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_220]], implicit [[V_CVT_I32_F64_e32_221]], implicit [[V_CVT_I32_F64_e32_222]], implicit [[V_CVT_I32_F64_e32_223]], implicit [[V_CVT_I32_F64_e32_224]], implicit [[V_CVT_I32_F64_e32_225]], implicit [[V_CVT_I32_F64_e32_226]], implicit [[V_CVT_I32_F64_e32_227]], implicit [[V_CVT_I32_F64_e32_228]], implicit [[V_CVT_I32_F64_e32_229]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_230]], implicit [[V_CVT_I32_F64_e32_231]], implicit [[V_CVT_I32_F64_e32_232]], implicit [[V_CVT_I32_F64_e32_233]], implicit [[V_CVT_I32_F64_e32_234]], implicit [[V_CVT_I32_F64_e32_235]], implicit [[V_CVT_I32_F64_e32_236]], implicit [[V_CVT_I32_F64_e32_237]], implicit [[V_CVT_I32_F64_e32_238]], implicit [[V_CVT_I32_F64_e32_239]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_240]], implicit [[V_CVT_I32_F64_e32_241]], implicit [[V_CVT_I32_F64_e32_242]], implicit [[V_CVT_I32_F64_e32_243]], implicit [[V_CVT_I32_F64_e32_244]], implicit [[V_CVT_I32_F64_e32_245]], implicit [[V_CVT_I32_F64_e32_246]], implicit [[V_CVT_I32_F64_e32_247]], implicit [[V_CVT_I32_F64_e32_248]], implicit [[V_CVT_I32_F64_e32_249]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_255]], implicit [[V_CVT_I32_F64_e32_256]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    %255:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
+    %256:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
+
+  bb.1:
+    successors: %bb.2
+
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+    %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+    %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+    %34:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0
+    %35:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 35, implicit $exec, implicit $mode, implicit-def $m0
+    %36:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 36, implicit $exec, implicit $mode, implicit-def $m0
+    %37:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 37, implicit $exec, implicit $mode, implicit-def $m0
+    %38:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 38, implicit $exec, implicit $mode, implicit-def $m0
+    %39:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 39, implicit $exec, implicit $mode, implicit-def $m0
+    %40:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 40, implicit $exec, implicit $mode, implicit-def $m0
+    %41:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 41, implicit $exec, implicit $mode, implicit-def $m0
+    %42:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 42, implicit $exec, implicit $mode, implicit-def $m0
+    %43:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 43, implicit $exec, implicit $mode, implicit-def $m0
+    %44:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 44, implicit $exec, implicit $mode, implicit-def $m0
+    %45:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 45, implicit $exec, implicit $mode, implicit-def $m0
+    %46:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 46, implicit $exec, implicit $mode, implicit-def $m0
+    %47:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 47, implicit $exec, implicit $mode, implicit-def $m0
+    %48:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 48, implicit $exec, implicit $mode, implicit-def $m0
+    %49:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 49, implicit $exec, implicit $mode, implicit-def $m0
+    %50:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 50, implicit $exec, implicit $mode, implicit-def $m0
+    %51:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 51, implicit $exec, implicit $mode, implicit-def $m0
+    %52:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 52, implicit $exec, implicit $mode, implicit-def $m0
+    %53:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 53, implicit $exec, implicit $mode, implicit-def $m0
+    %54:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 54, implicit $exec, implicit $mode, implicit-def $m0
+    %55:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 55, implicit $exec, implicit $mode, implicit-def $m0
+    %56:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 56, implicit $exec, implicit $mode, implicit-def $m0
+    %57:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 57, implicit $exec, implicit $mode, implicit-def $m0
+    %58:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 58, implicit $exec, implicit $mode, implicit-def $m0
+    %59:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 59, implicit $exec, implicit $mode, implicit-def $m0
+    %60:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 60, implicit $exec, implicit $mode, implicit-def $m0
+    %61:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 61, implicit $exec, implicit $mode, implicit-def $m0
+    %62:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 62, implicit $exec, implicit $mode, implicit-def $m0
+    %63:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 63, implicit $exec, implicit $mode, implicit-def $m0
+    %64:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode, implicit-def $m0
+    %65:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 65, implicit $exec, implicit $mode, implicit-def $m0
+    %66:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 66, implicit $exec, implicit $mode, implicit-def $m0
+    %67:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 67, implicit $exec, implicit $mode, implicit-def $m0
+    %68:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 68, implicit $exec, implicit $mode, implicit-def $m0
+    %69:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 69, implicit $exec, implicit $mode, implicit-def $m0
+    %70:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 70, implicit $exec, implicit $mode, implicit-def $m0
+    %71:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 71, implicit $exec, implicit $mode, implicit-def $m0
+    %72:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 72, implicit $exec, implicit $mode, implicit-def $m0
+    %73:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 73, implicit $exec, implicit $mode, implicit-def $m0
+    %74:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 74, implicit $exec, implicit $mode, implicit-def $m0
+    %75:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 75, implicit $exec, implicit $mode, implicit-def $m0
+    %76:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 76, implicit $exec, implicit $mode, implicit-def $m0
+    %77:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 77, implicit $exec, implicit $mode, implicit-def $m0
+    %78:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 78, implicit $exec, implicit $mode, implicit-def $m0
+    %79:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 79, implicit $exec, implicit $mode, implicit-def $m0
+    %80:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 80, implicit $exec, implicit $mode, implicit-def $m0
+    %81:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 81, implicit $exec, implicit $mode, implicit-def $m0
+    %82:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 82, implicit $exec, implicit $mode, implicit-def $m0
+    %83:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 83, implicit $exec, implicit $mode, implicit-def $m0
+    %84:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 84, implicit $exec, implicit $mode, implicit-def $m0
+    %85:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 85, implicit $exec, implicit $mode, implicit-def $m0
+    %86:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 86, implicit $exec, implicit $mode, implicit-def $m0
+    %87:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 87, implicit $exec, implicit $mode, implicit-def $m0
+    %88:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 88, implicit $exec, implicit $mode, implicit-def $m0
+    %89:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 89, implicit $exec, implicit $mode, implicit-def $m0
+    %90:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 90, implicit $exec, implicit $mode, implicit-def $m0
+    %91:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 91, implicit $exec, implicit $mode, implicit-def $m0
+    %92:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 92, implicit $exec, implicit $mode, implicit-def $m0
+    %93:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 93, implicit $exec, implicit $mode, implicit-def $m0
+    %94:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 94, implicit $exec, implicit $mode, implicit-def $m0
+    %95:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 95, implicit $exec, implicit $mode, implicit-def $m0
+    %96:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 96, implicit $exec, implicit $mode, implicit-def $m0
+    %97:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 97, implicit $exec, implicit $mode, implicit-def $m0
+    %98:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 98, implicit $exec, implicit $mode, implicit-def $m0
+    %99:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 99, implicit $exec, implicit $mode, implicit-def $m0
+    %100:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 100, implicit $exec, implicit $mode, implicit-def $m0
+    %101:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 101, implicit $exec, implicit $mode, implicit-def $m0
+    %102:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 102, implicit $exec, implicit $mode, implicit-def $m0
+    %103:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 103, implicit $exec, implicit $mode, implicit-def $m0
+    %104:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 104, implicit $exec, implicit $mode, implicit-def $m0
+    %105:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 105, implicit $exec, implicit $mode, implicit-def $m0
+    %106:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 106, implicit $exec, implicit $mode, implicit-def $m0
+    %107:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 107, implicit $exec, implicit $mode, implicit-def $m0
+    %108:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 108, implicit $exec, implicit $mode, implicit-def $m0
+    %109:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 109, implicit $exec, implicit $mode, implicit-def $m0
+    %110:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 110, implicit $exec, implicit $mode, implicit-def $m0
+    %111:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 111, implicit $exec, implicit $mode, implicit-def $m0
+    %112:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 112, implicit $exec, implicit $mode, implicit-def $m0
+    %113:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 113, implicit $exec, implicit $mode, implicit-def $m0
+    %114:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 114, implicit $exec, implicit $mode, implicit-def $m0
+    %115:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 115, implicit $exec, implicit $mode, implicit-def $m0
+    %116:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 116, implicit $exec, implicit $mode, implicit-def $m0
+    %117:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 117, implicit $exec, implicit $mode, implicit-def $m0
+    %118:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 118, implicit $exec, implicit $mode, implicit-def $m0
+    %119:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 119, implicit $exec, implicit $mode, implicit-def $m0
+    %120:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 120, implicit $exec, implicit $mode, implicit-def $m0
+    %121:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 121, implicit $exec, implicit $mode, implicit-def $m0
+    %122:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 122, implicit $exec, implicit $mode, implicit-def $m0
+    %123:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 123, implicit $exec, implicit $mode, implicit-def $m0
+    %124:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 124, implicit $exec, implicit $mode, implicit-def $m0
+    %125:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 125, implicit $exec, implicit $mode, implicit-def $m0
+    %126:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 126, implicit $exec, implicit $mode, implicit-def $m0
+    %127:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 127, implicit $exec, implicit $mode, implicit-def $m0
+    %128:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 128, implicit $exec, implicit $mode, implicit-def $m0
+    %129:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 129, implicit $exec, implicit $mode, implicit-def $m0
+    %130:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 130, implicit $exec, implicit $mode, implicit-def $m0
+    %131:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 131, implicit $exec, implicit $mode, implicit-def $m0
+    %132:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 132, implicit $exec, implicit $mode, implicit-def $m0
+    %133:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 133, implicit $exec, implicit $mode, implicit-def $m0
+    %134:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 134, implicit $exec, implicit $mode, implicit-def $m0
+    %135:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 135, implicit $exec, implicit $mode, implicit-def $m0
+    %136:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 136, implicit $exec, implicit $mode, implicit-def $m0
+    %137:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 137, implicit $exec, implicit $mode, implicit-def $m0
+    %138:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 138, implicit $exec, implicit $mode, implicit-def $m0
+    %139:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 139, implicit $exec, implicit $mode, implicit-def $m0
+    %140:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 140, implicit $exec, implicit $mode, implicit-def $m0
+    %141:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 141, implicit $exec, implicit $mode, implicit-def $m0
+    %142:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 142, implicit $exec, implicit $mode, implicit-def $m0
+    %143:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 143, implicit $exec, implicit $mode, implicit-def $m0
+    %144:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 144, implicit $exec, implicit $mode, implicit-def $m0
+    %145:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 145, implicit $exec, implicit $mode, implicit-def $m0
+    %146:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 146, implicit $exec, implicit $mode, implicit-def $m0
+    %147:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 147, implicit $exec, implicit $mode, implicit-def $m0
+    %148:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 148, implicit $exec, implicit $mode, implicit-def $m0
+    %149:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 149, implicit $exec, implicit $mode, implicit-def $m0
+    %150:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 150, implicit $exec, implicit $mode, implicit-def $m0
+    %151:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 151, implicit $exec, implicit $mode, implicit-def $m0
+    %152:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 152, implicit $exec, implicit $mode, implicit-def $m0
+    %153:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 153, implicit $exec, implicit $mode, implicit-def $m0
+    %154:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 154, implicit $exec, implicit $mode, implicit-def $m0
+    %155:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 155, implicit $exec, implicit $mode, implicit-def $m0
+    %156:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 156, implicit $exec, implicit $mode, implicit-def $m0
+    %157:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 157, implicit $exec, implicit $mode, implicit-def $m0
+    %158:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 158, implicit $exec, implicit $mode, implicit-def $m0
+    %159:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 159, implicit $exec, implicit $mode, implicit-def $m0
+    %160:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 160, implicit $exec, implicit $mode, implicit-def $m0
+    %161:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 161, implicit $exec, implicit $mode, implicit-def $m0
+    %162:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 162, implicit $exec, implicit $mode, implicit-def $m0
+    %163:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 163, implicit $exec, implicit $mode, implicit-def $m0
+    %164:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 164, implicit $exec, implicit $mode, implicit-def $m0
+    %165:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 165, implicit $exec, implicit $mode, implicit-def $m0
+    %166:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 166, implicit $exec, implicit $mode, implicit-def $m0
+    %167:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 167, implicit $exec, implicit $mode, implicit-def $m0
+    %168:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 168, implicit $exec, implicit $mode, implicit-def $m0
+    %169:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 169, implicit $exec, implicit $mode, implicit-def $m0
+    %170:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 170, implicit $exec, implicit $mode, implicit-def $m0
+    %171:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 171, implicit $exec, implicit $mode, implicit-def $m0
+    %172:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 172, implicit $exec, implicit $mode, implicit-def $m0
+    %173:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 173, implicit $exec, implicit $mode, implicit-def $m0
+    %174:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 174, implicit $exec, implicit $mode, implicit-def $m0
+    %175:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 175, implicit $exec, implicit $mode, implicit-def $m0
+    %176:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 176, implicit $exec, implicit $mode, implicit-def $m0
+    %177:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 177, implicit $exec, implicit $mode, implicit-def $m0
+    %178:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 178, implicit $exec, implicit $mode, implicit-def $m0
+    %179:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 179, implicit $exec, implicit $mode, implicit-def $m0
+    %180:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 180, implicit $exec, implicit $mode, implicit-def $m0
+    %181:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 181, implicit $exec, implicit $mode, implicit-def $m0
+    %182:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 182, implicit $exec, implicit $mode, implicit-def $m0
+    %183:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 183, implicit $exec, implicit $mode, implicit-def $m0
+    %184:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 184, implicit $exec, implicit $mode, implicit-def $m0
+    %185:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 185, implicit $exec, implicit $mode, implicit-def $m0
+    %186:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 186, implicit $exec, implicit $mode, implicit-def $m0
+    %187:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 187, implicit $exec, implicit $mode, implicit-def $m0
+    %188:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 188, implicit $exec, implicit $mode, implicit-def $m0
+    %189:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 189, implicit $exec, implicit $mode, implicit-def $m0
+    %190:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 190, implicit $exec, implicit $mode, implicit-def $m0
+    %191:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 191, implicit $exec, implicit $mode, implicit-def $m0
+    %192:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 192, implicit $exec, implicit $mode, implicit-def $m0
+    %193:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 193, implicit $exec, implicit $mode, implicit-def $m0
+    %194:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 194, implicit $exec, implicit $mode, implicit-def $m0
+    %195:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 195, implicit $exec, implicit $mode, implicit-def $m0
+    %196:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 196, implicit $exec, implicit $mode, implicit-def $m0
+    %197:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 197, implicit $exec, implicit $mode, implicit-def $m0
+    %198:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 198, implicit $exec, implicit $mode, implicit-def $m0
+    %199:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 199, implicit $exec, implicit $mode, implicit-def $m0
+    %200:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 200, implicit $exec, implicit $mode, implicit-def $m0
+    %201:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 201, implicit $exec, implicit $mode, implicit-def $m0
+    %202:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 202, implicit $exec, implicit $mode, implicit-def $m0
+    %203:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 203, implicit $exec, implicit $mode, implicit-def $m0
+    %204:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 204, implicit $exec, implicit $mode, implicit-def $m0
+    %205:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 205, implicit $exec, implicit $mode, implicit-def $m0
+    %206:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 206, implicit $exec, implicit $mode, implicit-def $m0
+    %207:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 207, implicit $exec, implicit $mode, implicit-def $m0
+    %208:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 208, implicit $exec, implicit $mode, implicit-def $m0
+    %209:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 209, implicit $exec, implicit $mode, implicit-def $m0
+    %210:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 210, implicit $exec, implicit $mode, implicit-def $m0
+    %211:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 211, implicit $exec, implicit $mode, implicit-def $m0
+    %212:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 212, implicit $exec, implicit $mode, implicit-def $m0
+    %213:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 213, implicit $exec, implicit $mode, implicit-def $m0
+    %214:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 214, implicit $exec, implicit $mode, implicit-def $m0
+    %215:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 215, implicit $exec, implicit $mode, implicit-def $m0
+    %216:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 216, implicit $exec, implicit $mode, implicit-def $m0
+    %217:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 217, implicit $exec, implicit $mode, implicit-def $m0
+    %218:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 218, implicit $exec, implicit $mode, implicit-def $m0
+    %219:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 219, implicit $exec, implicit $mode, implicit-def $m0
+    %220:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 220, implicit $exec, implicit $mode, implicit-def $m0
+    %221:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 221, implicit $exec, implicit $mode, implicit-def $m0
+    %222:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 222, implicit $exec, implicit $mode, implicit-def $m0
+    %223:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 223, implicit $exec, implicit $mode, implicit-def $m0
+    %224:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 224, implicit $exec, implicit $mode, implicit-def $m0
+    %225:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 225, implicit $exec, implicit $mode, implicit-def $m0
+    %226:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 226, implicit $exec, implicit $mode, implicit-def $m0
+    %227:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 227, implicit $exec, implicit $mode, implicit-def $m0
+    %228:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 228, implicit $exec, implicit $mode, implicit-def $m0
+    %229:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 229, implicit $exec, implicit $mode, implicit-def $m0
+    %230:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 230, implicit $exec, implicit $mode, implicit-def $m0
+    %231:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 231, implicit $exec, implicit $mode, implicit-def $m0
+    %232:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 232, implicit $exec, implicit $mode, implicit-def $m0
+    %233:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 233, implicit $exec, implicit $mode, implicit-def $m0
+    %234:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 234, implicit $exec, implicit $mode, implicit-def $m0
+    %235:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 235, implicit $exec, implicit $mode, implicit-def $m0
+    %236:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 236, implicit $exec, implicit $mode, implicit-def $m0
+    %237:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 237, implicit $exec, implicit $mode, implicit-def $m0
+    %238:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 238, implicit $exec, implicit $mode, implicit-def $m0
+    %239:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 239, implicit $exec, implicit $mode, implicit-def $m0
+    %240:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 240, implicit $exec, implicit $mode, implicit-def $m0
+    %241:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 241, implicit $exec, implicit $mode, implicit-def $m0
+    %242:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 242, implicit $exec, implicit $mode, implicit-def $m0
+    %243:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 243, implicit $exec, implicit $mode, implicit-def $m0
+    %244:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 244, implicit $exec, implicit $mode, implicit-def $m0
+    %245:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 245, implicit $exec, implicit $mode, implicit-def $m0
+    %246:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 246, implicit $exec, implicit $mode, implicit-def $m0
+    %247:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 247, implicit $exec, implicit $mode, implicit-def $m0
+    %248:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 248, implicit $exec, implicit $mode, implicit-def $m0
+    %249:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 249, implicit $exec, implicit $mode, implicit-def $m0
+    %250:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 250, implicit $exec, implicit $mode, implicit-def $m0
+    %251:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 251, implicit $exec, implicit $mode, implicit-def $m0
+    %252:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 252, implicit $exec, implicit $mode, implicit-def $m0
+    %253:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 253, implicit $exec, implicit $mode, implicit-def $m0
+    %254:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 254, implicit $exec, implicit $mode, implicit-def $m0
+
+  bb.2:
+
+    S_NOP 0, implicit %0,   implicit %1,   implicit %2,   implicit %3,   implicit %4,   implicit %5,   implicit %6,   implicit %7,   implicit %8,   implicit %9
+    S_NOP 0, implicit %10,  implicit %11,  implicit %12,  implicit %13,  implicit %14,  implicit %15,  implicit %16,  implicit %17,  implicit %18,  implicit %19
+    S_NOP 0, implicit %20,  implicit %21,  implicit %22,  implicit %23,  implicit %24,  implicit %25,  implicit %26,  implicit %27,  implicit %28,  implicit %29
+    S_NOP 0, implicit %30,  implicit %31,  implicit %32,  implicit %33,  implicit %34,  implicit %35,  implicit %36,  implicit %37,  implicit %38,  implicit %39
+    S_NOP 0, implicit %40,  implicit %41,  implicit %42,  implicit %43,  implicit %44,  implicit %45,  implicit %46,  implicit %47,  implicit %48,  implicit %49
+    S_NOP 0, implicit %50,  implicit %51,  implicit %52,  implicit %53,  implicit %54,  implicit %55,  implicit %56,  implicit %57,  implicit %58,  implicit %59
+    S_NOP 0, implicit %60,  implicit %61,  implicit %62,  implicit %63,  implicit %64,  implicit %65,  implicit %66,  implicit %67,  implicit %68,  implicit %69
+    S_NOP 0, implicit %70,  implicit %71,  implicit %72,  implicit %73,  implicit %74,  implicit %75,  implicit %76,  implicit %77,  implicit %78,  implicit %79
+    S_NOP 0, implicit %80,  implicit %81,  implicit %82,  implicit %83,  implicit %84,  implicit %85,  implicit %86,  implicit %87,  implicit %88,  implicit %89
+    S_NOP 0, implicit %90,  implicit %91,  implicit %92,  implicit %93,  implicit %94,  implicit %95,  implicit %96,  implicit %97,  implicit %98,  implicit %99
+    S_NOP 0, implicit %100, implicit %101, implicit %102, implicit %103, implicit %104, implicit %105, implicit %106, implicit %107, implicit %108, implicit %109
+    S_NOP 0, implicit %110, implicit %111, implicit %112, implicit %113, implicit %114, implicit %115, implicit %116, implicit %117, implicit %118, implicit %119
+    S_NOP 0, implicit %120, implicit %121, implicit %122, implicit %123, implicit %124, implicit %125, implicit %126, implicit %127, implicit %128, implicit %129
+    S_NOP 0, implicit %130, implicit %131, implicit %132, implicit %133, implicit %134, implicit %135, implicit %136, implicit %137, implicit %138, implicit %139
+    S_NOP 0, implicit %140, implicit %141, implicit %142, implicit %143, implicit %144, implicit %145, implicit %146, implicit %147, implicit %148, implicit %149
+    S_NOP 0, implicit %150, implicit %151, implicit %152, implicit %153, implicit %154, implicit %155, implicit %156, implicit %157, implicit %158, implicit %159
+    S_NOP 0, implicit %160, implicit %161, implicit %162, implicit %163, implicit %164, implicit %165, implicit %166, implicit %167, implicit %168, implicit %169
+    S_NOP 0, implicit %170, implicit %171, implicit %172, implicit %173, implicit %174, implicit %175, implicit %176, implicit %177, implicit %178, implicit %179
+    S_NOP 0, implicit %180, implicit %181, implicit %182, implicit %183, implicit %184, implicit %185, implicit %186, implicit %187, implicit %188, implicit %189
+    S_NOP 0, implicit %190, implicit %191, implicit %192, implicit %193, implicit %194, implicit %195, implicit %196, implicit %197, implicit %198, implicit %199
+    S_NOP 0, implicit %200, implicit %201, implicit %202, implicit %203, implicit %204, implicit %205, implicit %206, implicit %207, implicit %208, implicit %209
+    S_NOP 0, implicit %210, implicit %211, implicit %212, implicit %213, implicit %214, implicit %215, implicit %216, implicit %217, implicit %218, implicit %219
+    S_NOP 0, implicit %220, implicit %221, implicit %222, implicit %223, implicit %224, implicit %225, implicit %226, implicit %227, implicit %228, implicit %229
+    S_NOP 0, implicit %230, implicit %231, implicit %232, implicit %233, implicit %234, implicit %235, implicit %236, implicit %237, implicit %238, implicit %239
+    S_NOP 0, implicit %240, implicit %241, implicit %242, implicit %243, implicit %244, implicit %245, implicit %246, implicit %247, implicit %248, implicit %249
+    S_NOP 0, implicit %250, implicit %251, implicit %252, implicit %253, implicit %254
+
+    S_NOP 0, implicit %255, implicit %256
+
+    S_ENDPGM 0
+...
+---
 name:            test_no_sink_two_subregs_in_def_block
 tracksRegLiveness: true
 machineFunctionInfo:
@@ -5544,12 +6142,12 @@ body:             |
     S_ENDPGM 0
 ...
 ---
-name:            test_occ_9_no_sink_one_def_of_undef_subreg
+name:            test_occ_7_sink_one_def_of_undef_subreg_for_8
 tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
 body:             |
-  ; GFX908-LABEL: name: test_occ_9_no_sink_one_def_of_undef_subreg
+  ; GFX908-LABEL: name: test_occ_7_sink_one_def_of_undef_subreg_for_8
   ; GFX908: bb.0:
   ; GFX908-NEXT:   successors: %bb.1(0x80000000)
   ; GFX908-NEXT: {{  $}}
@@ -5576,16 +6174,22 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]]
@@ -5597,7 +6201,13 @@ body:             |
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_25]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
   ; GFX908-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1
@@ -5625,17 +6235,24 @@ body:             |
     %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
     %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
     %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
-    undef %23.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+    undef %32.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
 
   bb.1:
     successors: %bb.2
 
-    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-    S_NOP 0, implicit %24
+    S_NOP 0, implicit %0
 
   bb.2:
 
-    S_NOP 0, implicit %23.sub1
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
     S_NOP 0, implicit %4, implicit %5
@@ -5647,7 +6264,12 @@ body:             |
     S_NOP 0, implicit %16, implicit %17
     S_NOP 0, implicit %18, implicit %19
     S_NOP 0, implicit %20, implicit %21
-    S_NOP 0, implicit %22
+    S_NOP 0, implicit %22, implicit %23
+    S_NOP 0, implicit %24, implicit %25
+    S_NOP 0, implicit %26, implicit %27
+    S_NOP 0, implicit %28, implicit %29
+    S_NOP 0, implicit %30, implicit %31
+    S_NOP 0, implicit %32.sub1
     S_ENDPGM 0
 ...
 ---
@@ -5866,1227 +6488,673 @@ body:             |
     S_NOP 0, implicit %22
     S_ENDPGM 0
 ...
-
 ---
-name:            test_occ_8_physreg_use
+name:            test_live_through_occ_7_sink_for_8
 tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
 body:             |
-  ; GFX908-LABEL: name: test_occ_8_physreg_use
+  ; GFX908-LABEL: name: test_live_through_occ_7_sink_for_8
   ; GFX908: bb.0:
   ; GFX908-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   $vgpr8 = IMPLICIT_DEF
-  ; GFX908-NEXT:   $vgpr9 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
-  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 $vgpr8, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 $vgpr9, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF6]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   S_BRANCH %bb.1
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_30]], implicit [[DEF29]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_31]], implicit [[DEF30]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_11]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_12]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_13]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_6]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_7]], implicit [[V_CVT_I32_F32_e32_15]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_8]], implicit [[V_CVT_I32_F32_e32_16]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_9]], implicit [[V_CVT_I32_F32_e32_17]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_10]], implicit [[V_CVT_I32_F32_e32_18]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]], implicit [[V_CVT_I32_F32_e32_22]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]], implicit [[V_CVT_I32_F32_e32_22]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_24]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_25]], implicit [[V_CVT_I32_F32_e32_26]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_27]], implicit [[V_CVT_I32_F32_e32_28]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_29]]
+  ; GFX908-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   dead [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.2:
+  ; GFX908-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_17]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_18]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_20]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_21]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_25]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_26]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]]
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.3:
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_]]
   ; GFX908-NEXT:   S_ENDPGM 0
   bb.0:
-    liveins: $vgpr0, $sgpr0_sgpr1
+    successors: %bb.1
 
-    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    %2:vgpr_32(s32) = COPY $vgpr0
-    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
-    undef %5.sub1:sreg_64 = S_MOV_B32 0
-    %5.sub0:sreg_64 = COPY %3.sub1
-    $vgpr8 = IMPLICIT_DEF
-    $vgpr9 = IMPLICIT_DEF
-    %11:vgpr_32 = IMPLICIT_DEF
-    %12:vgpr_32 = IMPLICIT_DEF
-    %13:vgpr_32 = IMPLICIT_DEF
-    %14:vgpr_32 = IMPLICIT_DEF
-    %15:vgpr_32 = IMPLICIT_DEF
-    %16:vgpr_32 = IMPLICIT_DEF
-    %17:vgpr_32 = IMPLICIT_DEF
-    %18:vgpr_32 = IMPLICIT_DEF
-    %19:vgpr_32 = IMPLICIT_DEF
-    %20:vgpr_32 = IMPLICIT_DEF
-    %21:vgpr_32 = IMPLICIT_DEF
-    %22:vgpr_32 = IMPLICIT_DEF
-    %23:vgpr_32 = IMPLICIT_DEF
-    %24:vgpr_32 = IMPLICIT_DEF
-    %25:vgpr_32 = IMPLICIT_DEF
-    %26:vgpr_32 = IMPLICIT_DEF
-    %27:vgpr_32 = IMPLICIT_DEF
-    %28:vgpr_32 = IMPLICIT_DEF
-    %29:vgpr_32 = IMPLICIT_DEF
-    %30:vgpr_32 = IMPLICIT_DEF
-    %31:vgpr_32 = IMPLICIT_DEF
-    %32:vgpr_32 = IMPLICIT_DEF
-    %33:vgpr_32 = IMPLICIT_DEF
-    %34:vgpr_32 = IMPLICIT_DEF
-    %35:vgpr_32 = IMPLICIT_DEF
-    %36:vgpr_32 = IMPLICIT_DEF
-    %37:vgpr_32 = IMPLICIT_DEF
-    %38:vgpr_32 = IMPLICIT_DEF
-    %39:vgpr_32 = IMPLICIT_DEF
-    %40:vgpr_32 = IMPLICIT_DEF
-    %41:vgpr_32 = IMPLICIT_DEF
-    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 $vgpr8, implicit $exec, implicit $mode
-    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 $vgpr9, implicit $exec, implicit $mode
-    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
-    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
-    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
-    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
-    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
-    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
-    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
-    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
-    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %20, implicit $exec, implicit $mode
-    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
-    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
-    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
-    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
-    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
-    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
-    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
-    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
-    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
-    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
-    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
-    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
-    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
-    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
-    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
-    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
-    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
-    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
-    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
-    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
-    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
-    S_BRANCH %bb.4
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode
 
-  bb.4:
+  bb.1:
+    successors: %bb.2
 
-    S_NOP 0, implicit %50, implicit %60, implicit %20
-    S_NOP 0, implicit %51, implicit %61, implicit %21
-    S_NOP 0, implicit %52, implicit %62
-    S_NOP 0, implicit %53, implicit %63
-    S_NOP 0, implicit %54, implicit %64
-    S_NOP 0, implicit %55, implicit %65
-    S_NOP 0, implicit %56, implicit %66
-    S_NOP 0, implicit %57, implicit %67
-    S_NOP 0, implicit %58, implicit %68
-    S_NOP 0, implicit %59, implicit %69
-    S_NOP 0, implicit %70, implicit %71
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %74, implicit %75
-    S_NOP 0, implicit %76, implicit %77
-    S_NOP 0, implicit %78, implicit %79
-    S_NOP 0, implicit %80
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+    %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+    %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+
+  bb.2:
+    successors: %bb.3
+
+    S_NOP 0, implicit %2, implicit %18
+    S_NOP 0, implicit %3, implicit %19
+    S_NOP 0, implicit %4, implicit %20
+    S_NOP 0, implicit %5, implicit %21
+    S_NOP 0, implicit %6, implicit %22
+    S_NOP 0, implicit %7, implicit %23
+    S_NOP 0, implicit %8, implicit %24
+    S_NOP 0, implicit %9, implicit %25
+    S_NOP 0, implicit %10, implicit %26
+    S_NOP 0, implicit %11, implicit %27
+    S_NOP 0, implicit %12, implicit %28
+    S_NOP 0, implicit %13, implicit %29
+    S_NOP 0, implicit %14, implicit %30
+    S_NOP 0, implicit %15, implicit %31
+    S_NOP 0, implicit %16, implicit %32
+    S_NOP 0, implicit %33
+
+  bb.3:
+    S_NOP 0, implicit %0, implicit %1
     S_ENDPGM 0
 ...
-
 ---
-name:            test_occ_8_exec_use
+name:            test_remat_over_exec_modif
 tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
 body:             |
-  ; GFX908-LABEL: name: test_occ_8_exec_use
+  ; GFX908-LABEL: name: test_remat_over_exec_modif
   ; GFX908: bb.0:
   ; GFX908-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; GFX908-NEXT:   liveins: $sgpr2_sgpr3
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
-  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF6]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   $vgpr8 = IMPLICIT_DEF
-  ; GFX908-NEXT:   $vgpr9 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_BRANCH %bb.1
+  ; GFX908-NEXT:   %new_exec:sgpr_64 = COPY $sgpr2_sgpr3
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   %save_exec:sreg_64 = S_MOV_B64 $exec
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   $exec = S_MOV_B64 %new_exec
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
-  ; GFX908-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 255
-  ; GFX908-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 [[S_MOV_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 1, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_30]], implicit [[V_CVT_I32_F32_e32_28]], implicit [[DEF29]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 2, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_31]], implicit [[V_CVT_I32_F32_e32_29]], implicit [[DEF30]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_9]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_10]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_11]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_12]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_13]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_6]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_7]], implicit [[V_CVT_I32_F32_e32_15]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_8]], implicit [[V_CVT_I32_F32_e32_16]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]], implicit [[V_CVT_I32_F32_e32_22]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_24]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_25]], implicit [[V_CVT_I32_F32_e32_26]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_27]]
-  ; GFX908-NEXT:   $exec = S_MOV_B64 [[S_AND_SAVEEXEC_B64_]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_16]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_17]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_18]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_20]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_21]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_25]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_26]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_28]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]]
+  ; GFX908-NEXT:   $exec = S_MOV_B64 %save_exec
   ; GFX908-NEXT:   S_ENDPGM 0
   bb.0:
-    liveins: $vgpr0, $sgpr0_sgpr1
+    liveins: $sgpr2_sgpr3
 
-    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    %2:vgpr_32(s32) = COPY $vgpr0
-    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
-    undef %5.sub1:sreg_64 = S_MOV_B32 0
-    %5.sub0:sreg_64 = COPY %3.sub1
-    $vgpr8 = IMPLICIT_DEF
-    $vgpr9 = IMPLICIT_DEF
-    %11:vgpr_32 = IMPLICIT_DEF
-    %12:vgpr_32 = IMPLICIT_DEF
-    %13:vgpr_32 = IMPLICIT_DEF
-    %14:vgpr_32 = IMPLICIT_DEF
-    %15:vgpr_32 = IMPLICIT_DEF
-    %16:vgpr_32 = IMPLICIT_DEF
-    %17:vgpr_32 = IMPLICIT_DEF
-    %18:vgpr_32 = IMPLICIT_DEF
-    %19:vgpr_32 = IMPLICIT_DEF
-    %20:vgpr_32 = IMPLICIT_DEF
-    %21:vgpr_32 = IMPLICIT_DEF
-    %22:vgpr_32 = IMPLICIT_DEF
-    %23:vgpr_32 = IMPLICIT_DEF
-    %24:vgpr_32 = IMPLICIT_DEF
-    %25:vgpr_32 = IMPLICIT_DEF
-    %26:vgpr_32 = IMPLICIT_DEF
-    %27:vgpr_32 = IMPLICIT_DEF
-    %28:vgpr_32 = IMPLICIT_DEF
-    %29:vgpr_32 = IMPLICIT_DEF
-    %30:vgpr_32 = IMPLICIT_DEF
-    %31:vgpr_32 = IMPLICIT_DEF
-    %32:vgpr_32 = IMPLICIT_DEF
-    %33:vgpr_32 = IMPLICIT_DEF
-    %34:vgpr_32 = IMPLICIT_DEF
-    %35:vgpr_32 = IMPLICIT_DEF
-    %36:vgpr_32 = IMPLICIT_DEF
-    %37:vgpr_32 = IMPLICIT_DEF
-    %38:vgpr_32 = IMPLICIT_DEF
-    %39:vgpr_32 = IMPLICIT_DEF
-    %40:vgpr_32 = IMPLICIT_DEF
-    %41:vgpr_32 = IMPLICIT_DEF
-    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 1, implicit $exec, implicit $mode
-    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 2, implicit $exec, implicit $mode
-    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
-    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
-    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
-    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
-    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
-    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
-    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
-    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
-    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %20, implicit $exec, implicit $mode
-    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
-    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
-    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
-    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
-    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
-    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
-    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
-    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
-    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
-    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
-    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
-    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
-    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
-    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
-    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
-    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
-    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
-    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
-    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
-    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
-    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
-    S_BRANCH %bb.4
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+    %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
 
-  bb.4:
+    %save_exec:sreg_64 = S_MOV_B64 $exec
+    %new_exec:sgpr_64 = COPY $sgpr2_sgpr3
+    $exec = S_MOV_B64 %new_exec
 
-    %100:sreg_64 = S_MOV_B64 255
-    %101:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed %100, implicit-def $exec, implicit-def $scc, implicit $exec
-    S_NOP 0, implicit %50, implicit %60, implicit %20
-    S_NOP 0, implicit %51, implicit %61, implicit %21
-    S_NOP 0, implicit %52, implicit %62
-    S_NOP 0, implicit %53, implicit %63
-    S_NOP 0, implicit %54, implicit %64
-    S_NOP 0, implicit %55, implicit %65
-    S_NOP 0, implicit %56, implicit %66
-    S_NOP 0, implicit %57, implicit %67
-    S_NOP 0, implicit %58, implicit %68
-    S_NOP 0, implicit %59, implicit %69
-    S_NOP 0, implicit %70, implicit %71
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %74, implicit %75
-    S_NOP 0, implicit %76, implicit %77
-    S_NOP 0, implicit %78, implicit %79
-    S_NOP 0, implicit %80
-    $exec = S_MOV_B64 %101:sreg_64_xexec
+  bb.1:
+
+    S_NOP 0, implicit %0, implicit %16
+    S_NOP 0, implicit %1, implicit %17
+    S_NOP 0, implicit %2, implicit %18
+    S_NOP 0, implicit %3, implicit %19
+    S_NOP 0, implicit %4, implicit %20
+    S_NOP 0, implicit %5, implicit %21
+    S_NOP 0, implicit %6, implicit %22
+    S_NOP 0, implicit %7, implicit %23
+    S_NOP 0, implicit %8, implicit %24
+    S_NOP 0, implicit %9, implicit %25
+    S_NOP 0, implicit %10, implicit %26
+    S_NOP 0, implicit %11, implicit %27
+    S_NOP 0, implicit %12, implicit %28
+    S_NOP 0, implicit %13, implicit %29
+    S_NOP 0, implicit %14, implicit %30
+    S_NOP 0, implicit %15, implicit %31
+    S_NOP 0, implicit %32
+
+    $exec = S_MOV_B64 %save_exec
     S_ENDPGM 0
 ...
-
 ---
-name:            remat_virtual_vgpr_occ_6
+name:            test_rollback_remat_defregion_above_target
 tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
 body:             |
-  ; GFX908-LABEL: name: remat_virtual_vgpr_occ_6
+  ; GFX908-LABEL: name: test_rollback_remat_defregion_above_target
   ; GFX908: bb.0:
   ; GFX908-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
-  ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF6]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
-  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; GFX908-NEXT:   S_BRANCH %bb.1
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT: bb.1:
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_30]], implicit [[V_CVT_I32_F32_e32_25]], implicit [[DEF22]], implicit [[DEF27]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_31]], implicit [[V_CVT_I32_F32_e32_26]], implicit [[DEF23]], implicit [[DEF28]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_22]], implicit [[V_CVT_I32_F32_e32_27]], implicit [[DEF24]], implicit [[DEF29]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_28]], implicit [[DEF25]], implicit [[DEF30]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_24]], implicit [[V_CVT_I32_F32_e32_29]], implicit [[DEF26]], implicit [[DEF31]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_6]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_11]], implicit [[V_CVT_I32_F32_e32_12]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_15]], implicit [[V_CVT_I32_F32_e32_16]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]]
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.2:
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]]
   ; GFX908-NEXT:   S_ENDPGM 0
   bb.0:
-    liveins: $vgpr0, $sgpr0_sgpr1
+    successors: %bb.1
 
-    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    %2:vgpr_32(s32) = COPY $vgpr0
-    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
-    undef %5.sub1:sreg_64 = S_MOV_B32 0
-    %5.sub0:sreg_64 = COPY %3.sub1
-    %10:vgpr_32 = IMPLICIT_DEF
-    %11:vgpr_32 = IMPLICIT_DEF
-    %12:vgpr_32 = IMPLICIT_DEF
-    %13:vgpr_32 = IMPLICIT_DEF
-    %14:vgpr_32 = IMPLICIT_DEF
-    %15:vgpr_32 = IMPLICIT_DEF
-    %16:vgpr_32 = IMPLICIT_DEF
-    %17:vgpr_32 = IMPLICIT_DEF
-    %18:vgpr_32 = IMPLICIT_DEF
-    %19:vgpr_32 = IMPLICIT_DEF
-    %20:vgpr_32 = IMPLICIT_DEF
-    %21:vgpr_32 = IMPLICIT_DEF
-    %22:vgpr_32 = IMPLICIT_DEF
-    %23:vgpr_32 = IMPLICIT_DEF
-    %24:vgpr_32 = IMPLICIT_DEF
-    %25:vgpr_32 = IMPLICIT_DEF
-    %26:vgpr_32 = IMPLICIT_DEF
-    %27:vgpr_32 = IMPLICIT_DEF
-    %28:vgpr_32 = IMPLICIT_DEF
-    %29:vgpr_32 = IMPLICIT_DEF
-    %30:vgpr_32 = IMPLICIT_DEF
-    %31:vgpr_32 = IMPLICIT_DEF
-    %32:vgpr_32 = IMPLICIT_DEF
-    %33:vgpr_32 = IMPLICIT_DEF
-    %34:vgpr_32 = IMPLICIT_DEF
-    %35:vgpr_32 = IMPLICIT_DEF
-    %36:vgpr_32 = IMPLICIT_DEF
-    %37:vgpr_32 = IMPLICIT_DEF
-    %38:vgpr_32 = IMPLICIT_DEF
-    %39:vgpr_32 = IMPLICIT_DEF
-    %40:vgpr_32 = IMPLICIT_DEF
-    %41:vgpr_32 = IMPLICIT_DEF
-    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10, implicit $exec, implicit $mode
-    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
-    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
-    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
-    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
-    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
-    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
-    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
-    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
-    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
-    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %20, implicit $exec, implicit $mode
-    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
-    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
-    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
-    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
-    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
-    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
-    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
-    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
-    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
-    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
-    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
-    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
-    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
-    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
-    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
-    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
-    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
-    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
-    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
-    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
-    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
-    S_BRANCH %bb.4
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+    %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+
+  bb.1:
+    successors: %bb.2
+
+    S_NOP 0, implicit %0,   implicit %1,   implicit %2,   implicit %3,   implicit %4,
+    S_NOP 0, implicit %5,   implicit %6,   implicit %7,   implicit %8,   implicit %9
+    S_NOP 0, implicit %10,  implicit %11,  implicit %12,  implicit %13,  implicit %14
+    S_NOP 0, implicit %15,  implicit %16,  implicit %17,  implicit %18,  implicit %19
+    S_NOP 0, implicit %20,  implicit %21,  implicit %22,  implicit %23,  implicit %24
+    S_NOP 0, implicit %25,  implicit %26,  implicit %27,  implicit %28,  implicit %29
+    S_NOP 0, implicit %30,  implicit %31,  implicit %32
+
+  bb.2:
+
+    S_NOP 0, implicit %0,   implicit %1,   implicit %2,   implicit %3,   implicit %4,
+    S_NOP 0, implicit %5,   implicit %6,   implicit %7,   implicit %8,   implicit %9
+    S_NOP 0, implicit %10,  implicit %11,  implicit %12,  implicit %13,  implicit %14
+    S_NOP 0, implicit %15,  implicit %16,  implicit %17,  implicit %18,  implicit %19
+    S_NOP 0, implicit %20,  implicit %21,  implicit %22,  implicit %23,  implicit %24
+    S_NOP 0, implicit %25,  implicit %26,  implicit %27,  implicit %28,  implicit %29
+    S_NOP 0, implicit %30,  implicit %31
 
-  bb.4:
-    S_NOP 0, implicit %50, implicit %60, implicit %10, implicit %20
-    S_NOP 0, implicit %51, implicit %61, implicit %11, implicit %21
-    S_NOP 0, implicit %52, implicit %62, implicit %12, implicit %22
-    S_NOP 0, implicit %53, implicit %63, implicit %13, implicit %23
-    S_NOP 0, implicit %54, implicit %64, implicit %14, implicit %24
-    S_NOP 0, implicit %55, implicit %65
-    S_NOP 0, implicit %56, implicit %66
-    S_NOP 0, implicit %57, implicit %67
-    S_NOP 0, implicit %58, implicit %68
-    S_NOP 0, implicit %59, implicit %69
-    S_NOP 0, implicit %70, implicit %71
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %74, implicit %75
-    S_NOP 0, implicit %76, implicit %77
-    S_NOP 0, implicit %78, implicit %79
-    S_NOP 0, implicit %80
     S_ENDPGM 0
 ...
-
 ---
-name:            remat_virtual_vgpr_uses_not_available
+name:            test_rollback_remat_useregion_above_target
 tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
 body:             |
-  ; GFX908-LABEL: name: remat_virtual_vgpr_uses_not_available
+  ; GFX908-LABEL: name: test_rollback_remat_useregion_above_target
   ; GFX908: bb.0:
   ; GFX908-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF6]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
-  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; GFX908-NEXT:   S_BRANCH %bb.1
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_6]], implicit [[DEF22]], implicit [[DEF27]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_22]], implicit [[V_CVT_I32_F32_e32_27]], implicit [[DEF23]], implicit [[DEF28]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_28]], implicit [[DEF24]], implicit [[DEF29]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_24]], implicit [[V_CVT_I32_F32_e32_29]], implicit [[DEF25]], implicit [[DEF30]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_25]], implicit [[V_CVT_I32_F32_e32_30]], implicit [[DEF26]], implicit [[DEF31]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_26]], implicit [[V_CVT_I32_F32_e32_31]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_11]], implicit [[V_CVT_I32_F32_e32_12]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_15]], implicit [[V_CVT_I32_F32_e32_16]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]]
+  ; GFX908-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.2:
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_34]]
   ; GFX908-NEXT:   S_ENDPGM 0
   bb.0:
-    liveins: $vgpr0, $sgpr0_sgpr1
+    successors: %bb.1
 
-    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    %2:vgpr_32(s32) = COPY $vgpr0
-    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
-    undef %5.sub1:sreg_64 = S_MOV_B32 0
-    %5.sub0:sreg_64 = COPY %3.sub1
-    %10:vgpr_32 = IMPLICIT_DEF
-    %11:vgpr_32 = IMPLICIT_DEF
-    %12:vgpr_32 = IMPLICIT_DEF
-    %13:vgpr_32 = IMPLICIT_DEF
-    %14:vgpr_32 = IMPLICIT_DEF
-    %15:vgpr_32 = IMPLICIT_DEF
-    %16:vgpr_32 = IMPLICIT_DEF
-    %17:vgpr_32 = IMPLICIT_DEF
-    %18:vgpr_32 = IMPLICIT_DEF
-    %19:vgpr_32 = IMPLICIT_DEF
-    %20:vgpr_32 = IMPLICIT_DEF
-    %21:vgpr_32 = IMPLICIT_DEF
-    %22:vgpr_32 = IMPLICIT_DEF
-    %23:vgpr_32 = IMPLICIT_DEF
-    %24:vgpr_32 = IMPLICIT_DEF
-    %25:vgpr_32 = IMPLICIT_DEF
-    %26:vgpr_32 = IMPLICIT_DEF
-    %27:vgpr_32 = IMPLICIT_DEF
-    %28:vgpr_32 = IMPLICIT_DEF
-    %29:vgpr_32 = IMPLICIT_DEF
-    %30:vgpr_32 = IMPLICIT_DEF
-    %31:vgpr_32 = IMPLICIT_DEF
-    %32:vgpr_32 = IMPLICIT_DEF
-    %33:vgpr_32 = IMPLICIT_DEF
-    %34:vgpr_32 = IMPLICIT_DEF
-    %35:vgpr_32 = IMPLICIT_DEF
-    %36:vgpr_32 = IMPLICIT_DEF
-    %37:vgpr_32 = IMPLICIT_DEF
-    %38:vgpr_32 = IMPLICIT_DEF
-    %39:vgpr_32 = IMPLICIT_DEF
-    %40:vgpr_32 = IMPLICIT_DEF
-    %41:vgpr_32 = IMPLICIT_DEF
-    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10, implicit $exec, implicit $mode
-    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
-    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
-    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
-    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
-    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
-    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
-    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
-    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
-    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
-    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %20, implicit $exec, implicit $mode
-    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
-    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
-    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
-    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
-    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
-    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
-    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
-    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
-    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
-    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
-    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
-    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
-    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
-    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
-    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
-    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
-    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
-    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
-    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
-    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
-    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
-    S_BRANCH %bb.4
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
 
-  bb.4:
-    S_NOP 0, implicit %50, implicit %60, implicit %11, implicit %21
-    S_NOP 0, implicit %51, implicit %61, implicit %12, implicit %22
-    S_NOP 0, implicit %52, implicit %62, implicit %13, implicit %23
-    S_NOP 0, implicit %53, implicit %63, implicit %14, implicit %24
-    S_NOP 0, implicit %54, implicit %64, implicit %15, implicit %25
-    S_NOP 0, implicit %55, implicit %65
-    S_NOP 0, implicit %56, implicit %66
-    S_NOP 0, implicit %57, implicit %67
-    S_NOP 0, implicit %58, implicit %68
-    S_NOP 0, implicit %59, implicit %69
-    S_NOP 0, implicit %70, implicit %71
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %74, implicit %75
-    S_NOP 0, implicit %76, implicit %77
-    S_NOP 0, implicit %78, implicit %79
-    S_NOP 0, implicit %80
-    S_ENDPGM 0
-...
+  bb.1:
+    successors: %bb.2
 
----
-name:            remat_virtual_vgpr_subreg_occ_6
-tracksRegLiveness: true
-machineFunctionInfo:
-  isEntryFunction: true
-body:             |
-  ; GFX908-LABEL: name: remat_virtual_vgpr_subreg_occ_6
-  ; GFX908: bb.0:
-  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
-  ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   undef [[DEF:%[0-9]+]].sub0:vreg_512 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub1:vreg_512 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub2:vreg_512 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub3:vreg_512 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
-  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub2, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_BRANCH %bb.1
-  ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT: bb.1:
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub0, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_30]], implicit [[V_CVT_I32_F32_e32_25]], implicit [[DEF]].sub0, implicit [[DEF]].sub2
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_31]], implicit [[V_CVT_I32_F32_e32_26]], implicit [[DEF24]], implicit [[DEF28]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_22]], implicit [[V_CVT_I32_F32_e32_27]], implicit [[DEF25]], implicit [[DEF29]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_28]], implicit [[DEF26]], implicit [[DEF30]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_24]], implicit [[V_CVT_I32_F32_e32_29]], implicit [[DEF27]], implicit [[DEF31]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_6]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_11]], implicit [[V_CVT_I32_F32_e32_12]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_15]], implicit [[V_CVT_I32_F32_e32_16]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]]
-  ; GFX908-NEXT:   S_ENDPGM 0
-  bb.0:
-    liveins: $vgpr0, $sgpr0_sgpr1
+    %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
 
-    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    %2:vgpr_32(s32) = COPY $vgpr0
-    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
-    undef %5.sub1:sreg_64 = S_MOV_B32 0
-    %5.sub0:sreg_64 = COPY %3.sub1
-    undef %10.sub0:vreg_512 = IMPLICIT_DEF
-    %10.sub1:vreg_512 = IMPLICIT_DEF
-    %10.sub2:vreg_512 = IMPLICIT_DEF
-    %10.sub3:vreg_512 = IMPLICIT_DEF
-    %11:vgpr_32 = IMPLICIT_DEF
-    %12:vgpr_32 = IMPLICIT_DEF
-    %13:vgpr_32 = IMPLICIT_DEF
-    %14:vgpr_32 = IMPLICIT_DEF
-    %15:vgpr_32 = IMPLICIT_DEF
-    %16:vgpr_32 = IMPLICIT_DEF
-    %17:vgpr_32 = IMPLICIT_DEF
-    %18:vgpr_32 = IMPLICIT_DEF
-    %19:vgpr_32 = IMPLICIT_DEF
-    %20:vgpr_32 = IMPLICIT_DEF
-    %21:vgpr_32 = IMPLICIT_DEF
-    %22:vgpr_32 = IMPLICIT_DEF
-    %23:vgpr_32 = IMPLICIT_DEF
-    %24:vgpr_32 = IMPLICIT_DEF
-    %25:vgpr_32 = IMPLICIT_DEF
-    %26:vgpr_32 = IMPLICIT_DEF
-    %27:vgpr_32 = IMPLICIT_DEF
-    %28:vgpr_32 = IMPLICIT_DEF
-    %29:vgpr_32 = IMPLICIT_DEF
-    %30:vgpr_32 = IMPLICIT_DEF
-    %31:vgpr_32 = IMPLICIT_DEF
-    %32:vgpr_32 = IMPLICIT_DEF
-    %33:vgpr_32 = IMPLICIT_DEF
-    %34:vgpr_32 = IMPLICIT_DEF
-    %35:vgpr_32 = IMPLICIT_DEF
-    %36:vgpr_32 = IMPLICIT_DEF
-    %37:vgpr_32 = IMPLICIT_DEF
-    %38:vgpr_32 = IMPLICIT_DEF
-    %39:vgpr_32 = IMPLICIT_DEF
-    %40:vgpr_32 = IMPLICIT_DEF
-    %41:vgpr_32 = IMPLICIT_DEF
-    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10.sub0, implicit $exec, implicit $mode
-    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
-    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
-    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
-    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
-    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
-    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
-    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
-    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
-    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
-    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10.sub2, implicit $exec, implicit $mode
-    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
-    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
-    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
-    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
-    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
-    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
-    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
-    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
-    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
-    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
-    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
-    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
-    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
-    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
-    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
-    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
-    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
-    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
-    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
-    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
-    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
-    S_BRANCH %bb.4
+    S_NOP 0, implicit %0,   implicit %1,   implicit %2,   implicit %3,   implicit %4,
+    S_NOP 0, implicit %5,   implicit %6,   implicit %7,   implicit %8,   implicit %9
+    S_NOP 0, implicit %10,  implicit %11,  implicit %12,  implicit %13,  implicit %14
+    S_NOP 0, implicit %15,  implicit %16,  implicit %17,  implicit %18,  implicit %19
+    S_NOP 0, implicit %20,  implicit %21,  implicit %22,  implicit %23,  implicit %24
+    S_NOP 0, implicit %25,  implicit %26,  implicit %27,  implicit %28,  implicit %29
+    S_NOP 0, implicit %30,  implicit %31,  implicit %32
+
+    %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+    %34:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode
+
+  bb.2:
+
+    S_NOP 0, implicit %0,   implicit %1,   implicit %2,   implicit %3,   implicit %4,
+    S_NOP 0, implicit %5,   implicit %6,   implicit %7,   implicit %8,   implicit %9
+    S_NOP 0, implicit %10,  implicit %11,  implicit %12,  implicit %13,  implicit %14
+    S_NOP 0, implicit %15,  implicit %16,  implicit %17,  implicit %18,  implicit %19
+    S_NOP 0, implicit %20,  implicit %21,  implicit %22,  implicit %23,  implicit %24
+    S_NOP 0, implicit %25,  implicit %26,  implicit %27,  implicit %28,  implicit %29
+    S_NOP 0, implicit %30,  implicit %31
+    S_NOP 0, implicit %33,  implicit %34
 
-  bb.4:
-    S_NOP 0, implicit %50, implicit %60, implicit %10.sub0, implicit %10.sub2
-    S_NOP 0, implicit %51, implicit %61, implicit %11, implicit %21
-    S_NOP 0, implicit %52, implicit %62, implicit %12, implicit %22
-    S_NOP 0, implicit %53, implicit %63, implicit %13, implicit %23
-    S_NOP 0, implicit %54, implicit %64, implicit %14, implicit %24
-    S_NOP 0, implicit %55, implicit %65
-    S_NOP 0, implicit %56, implicit %66
-    S_NOP 0, implicit %57, implicit %67
-    S_NOP 0, implicit %58, implicit %68
-    S_NOP 0, implicit %59, implicit %69
-    S_NOP 0, implicit %70, implicit %71
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %74, implicit %75
-    S_NOP 0, implicit %76, implicit %77
-    S_NOP 0, implicit %78, implicit %79
-    S_NOP 0, implicit %80
     S_ENDPGM 0
 ...
-
 ---
-name:            remat_virtual_vgpr_subreg_uses_not_available
+name:            test_rollback_remats_emptydefregion
 tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
 body:             |
-  ; GFX908-LABEL: name: remat_virtual_vgpr_subreg_uses_not_available
+  ; GFX908-LABEL: name: test_rollback_remats_emptydefregion
   ; GFX908: bb.0:
   ; GFX908-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
-  ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   undef [[DEF:%[0-9]+]].sub0:vreg_512 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub1:vreg_512 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub2:vreg_512 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub3:vreg_512 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
-  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub0, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub2, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_BRANCH %bb.1
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT: bb.1:
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_22]], implicit [[V_CVT_I32_F32_e32_27]], implicit [[DEF]].sub1, implicit [[DEF]].sub3
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_6]], implicit [[DEF24]], implicit [[DEF28]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_28]], implicit [[DEF25]], implicit [[DEF29]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_24]], implicit [[V_CVT_I32_F32_e32_29]], implicit [[DEF26]], implicit [[DEF30]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_25]], implicit [[V_CVT_I32_F32_e32_30]], implicit [[DEF27]], implicit [[DEF31]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_26]], implicit [[V_CVT_I32_F32_e32_31]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_11]], implicit [[V_CVT_I32_F32_e32_12]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_15]], implicit [[V_CVT_I32_F32_e32_16]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]]
-  ; GFX908-NEXT:   S_ENDPGM 0
-  bb.0:
-    liveins: $vgpr0, $sgpr0_sgpr1
-
-    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    %2:vgpr_32(s32) = COPY $vgpr0
-    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
-    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
-    undef %5.sub1:sreg_64 = S_MOV_B32 0
-    %5.sub0:sreg_64 = COPY %3.sub1
-    undef %10.sub0:vreg_512 = IMPLICIT_DEF
-    %10.sub1:vreg_512 = IMPLICIT_DEF
-    %10.sub2:vreg_512 = IMPLICIT_DEF
-    %10.sub3:vreg_512 = IMPLICIT_DEF
-    %11:vgpr_32 = IMPLICIT_DEF
-    %12:vgpr_32 = IMPLICIT_DEF
-    %13:vgpr_32 = IMPLICIT_DEF
-    %14:vgpr_32 = IMPLICIT_DEF
-    %15:vgpr_32 = IMPLICIT_DEF
-    %16:vgpr_32 = IMPLICIT_DEF
-    %17:vgpr_32 = IMPLICIT_DEF
-    %18:vgpr_32 = IMPLICIT_DEF
-    %19:vgpr_32 = IMPLICIT_DEF
-    %20:vgpr_32 = IMPLICIT_DEF
-    %21:vgpr_32 = IMPLICIT_DEF
-    %22:vgpr_32 = IMPLICIT_DEF
-    %23:vgpr_32 = IMPLICIT_DEF
-    %24:vgpr_32 = IMPLICIT_DEF
-    %25:vgpr_32 = IMPLICIT_DEF
-    %26:vgpr_32 = IMPLICIT_DEF
-    %27:vgpr_32 = IMPLICIT_DEF
-    %28:vgpr_32 = IMPLICIT_DEF
-    %29:vgpr_32 = IMPLICIT_DEF
-    %30:vgpr_32 = IMPLICIT_DEF
-    %31:vgpr_32 = IMPLICIT_DEF
-    %32:vgpr_32 = IMPLICIT_DEF
-    %33:vgpr_32 = IMPLICIT_DEF
-    %34:vgpr_32 = IMPLICIT_DEF
-    %35:vgpr_32 = IMPLICIT_DEF
-    %36:vgpr_32 = IMPLICIT_DEF
-    %37:vgpr_32 = IMPLICIT_DEF
-    %38:vgpr_32 = IMPLICIT_DEF
-    %39:vgpr_32 = IMPLICIT_DEF
-    %40:vgpr_32 = IMPLICIT_DEF
-    %41:vgpr_32 = IMPLICIT_DEF
-    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10.sub0, implicit $exec, implicit $mode
-    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
-    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
-    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
-    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
-    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
-    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
-    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
-    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
-    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
-    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10.sub2, implicit $exec, implicit $mode
-    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
-    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
-    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
-    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
-    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
-    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
-    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
-    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
-    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
-    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
-    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
-    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
-    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
-    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
-    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
-    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
-    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
-    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
-    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
-    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
-    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
-    S_BRANCH %bb.4
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.2:
+  ; GFX908-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_33]]
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.3:
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
 
-  bb.4:
-    S_NOP 0, implicit %50, implicit %60, implicit %10.sub1, implicit %10.sub3
-    S_NOP 0, implicit %51, implicit %61, implicit %12, implicit %22
-    S_NOP 0, implicit %52, implicit %62, implicit %13, implicit %23
-    S_NOP 0, implicit %53, implicit %63, implicit %14, implicit %24
-    S_NOP 0, implicit %54, implicit %64, implicit %15, implicit %25
-    S_NOP 0, implicit %55, implicit %65
-    S_NOP 0, implicit %56, implicit %66
-    S_NOP 0, implicit %57, implicit %67
-    S_NOP 0, implicit %58, implicit %68
-    S_NOP 0, implicit %59, implicit %69
-    S_NOP 0, implicit %70, implicit %71
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %74, implicit %75
-    S_NOP 0, implicit %76, implicit %77
-    S_NOP 0, implicit %78, implicit %79
-    S_NOP 0, implicit %80
-    S_ENDPGM 0
-...
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+
+  bb.1:
+    successors: %bb.2
+
+    %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+    %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+
+  bb.2:
+    successors: %bb.3
+
+    S_NOP 0, implicit %0,   implicit %1,   implicit %2,   implicit %3,   implicit %4,
+    S_NOP 0, implicit %5,   implicit %6,   implicit %7,   implicit %8,   implicit %9
+    S_NOP 0, implicit %10,  implicit %11,  implicit %12,  implicit %13,  implicit %14
+    S_NOP 0, implicit %15,  implicit %16,  implicit %17,  implicit %18,  implicit %19
+    S_NOP 0, implicit %20,  implicit %21,  implicit %22,  implicit %23,  implicit %24
+    S_NOP 0, implicit %25,  implicit %26,  implicit %27,  implicit %28,  implicit %29
+    S_NOP 0, implicit %30,  implicit %31,  implicit %32,  implicit %33
+
+  bb.3:
 
+    S_NOP 0, implicit %0,   implicit %1,   implicit %2,   implicit %3,   implicit %4,
+    S_NOP 0, implicit %5,   implicit %6,   implicit %7,   implicit %8,   implicit %9
+    S_NOP 0, implicit %10,  implicit %11,  implicit %12,  implicit %13,  implicit %14
+    S_NOP 0, implicit %15,  implicit %16,  implicit %17,  implicit %18,  implicit %19
+    S_NOP 0, implicit %20,  implicit %21,  implicit %22,  implicit %23,  implicit %24
+    S_NOP 0, implicit %25,  implicit %26,  implicit %27,  implicit %28,  implicit %29
+    S_NOP 0, implicit %30,  implicit %31
 
+    S_ENDPGM 0
+...
 ---
-name:            remat_virtual_vgpr_subreg_def
+name:            test_occ_8_physreg_use
 tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
 body:             |
-  ; GFX908-LABEL: name: remat_virtual_vgpr_subreg_def
+  ; GFX908-LABEL: name: test_occ_8_physreg_use
   ; GFX908: bb.0:
   ; GFX908-NEXT:   successors: %bb.1(0x80000000)
   ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
@@ -7095,95 +7163,93 @@ body:             |
   ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
   ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
   ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   undef [[DEF:%[0-9]+]].sub0:vreg_512 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub1:vreg_512 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub2:vreg_512 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub3:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   $vgpr8 = IMPLICIT_DEF
+  ; GFX908-NEXT:   $vgpr9 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
-  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; GFX908-NEXT:   undef [[V_CVT_I32_F32_e32_1:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
+  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 $vgpr8, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 $vgpr9, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF6]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   undef [[V_CVT_I32_F32_e32_1:%[0-9]+]].sub0:vreg_64 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub0, implicit $exec, implicit $mode
   ; GFX908-NEXT:   S_BRANCH %bb.1
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub2, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]].sub0, implicit [[V_CVT_I32_F32_e32_26]], implicit [[DEF]].sub0, implicit [[DEF]].sub2
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_27]], implicit [[DEF1]], implicit [[DEF28]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_28]], implicit [[DEF25]], implicit [[DEF29]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_29]], implicit [[V_CVT_I32_F32_e32_24]], implicit [[DEF26]], implicit [[DEF30]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_30]], implicit [[V_CVT_I32_F32_e32_25]], implicit [[DEF27]], implicit [[DEF31]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_6]], implicit [[V_CVT_I32_F32_e32_11]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_12]], implicit [[V_CVT_I32_F32_e32_13]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_14]], implicit [[V_CVT_I32_F32_e32_15]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_14]], implicit [[V_CVT_I32_F32_e32_15]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_16]], implicit [[V_CVT_I32_F32_e32_17]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_18]], implicit [[V_CVT_I32_F32_e32_19]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_20]], implicit [[V_CVT_I32_F32_e32_21]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_11]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_31]], implicit [[DEF30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_30]], implicit [[DEF29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_12]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_13]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_6]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_7]], implicit [[V_CVT_I32_F32_e32_15]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_8]], implicit [[V_CVT_I32_F32_e32_16]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_9]], implicit [[V_CVT_I32_F32_e32_17]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_10]], implicit [[V_CVT_I32_F32_e32_18]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]], implicit [[V_CVT_I32_F32_e32_22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]], implicit [[V_CVT_I32_F32_e32_22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_25]], implicit [[V_CVT_I32_F32_e32_26]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_27]], implicit [[V_CVT_I32_F32_e32_28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_29]]
   ; GFX908-NEXT:   S_ENDPGM 0
   bb.0:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -7194,10 +7260,8 @@ body:             |
     %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
     undef %5.sub1:sreg_64 = S_MOV_B32 0
     %5.sub0:sreg_64 = COPY %3.sub1
-    undef %10.sub0:vreg_512 = IMPLICIT_DEF
-    %10.sub1:vreg_512 = IMPLICIT_DEF
-    %10.sub2:vreg_512 = IMPLICIT_DEF
-    %10.sub3:vreg_512 = IMPLICIT_DEF
+    $vgpr8 = IMPLICIT_DEF
+    $vgpr9 = IMPLICIT_DEF
     %11:vgpr_32 = IMPLICIT_DEF
     %12:vgpr_32 = IMPLICIT_DEF
     %13:vgpr_32 = IMPLICIT_DEF
@@ -7229,8 +7293,8 @@ body:             |
     %39:vgpr_32 = IMPLICIT_DEF
     %40:vgpr_32 = IMPLICIT_DEF
     %41:vgpr_32 = IMPLICIT_DEF
-    undef %50.sub0:vreg_64 = nofpexcept V_CVT_I32_F32_e32 %10.sub0, implicit $exec, implicit $mode
-    %50.sub1:vreg_64 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
+    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 $vgpr8, implicit $exec, implicit $mode
+    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 $vgpr9, implicit $exec, implicit $mode
     %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
     %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
     %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
@@ -7239,7 +7303,7 @@ body:             |
     %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
     %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
     %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
-    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10.sub2, implicit $exec, implicit $mode
+    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %20, implicit $exec, implicit $mode
     %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
     %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
     %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
@@ -7264,11 +7328,12 @@ body:             |
     S_BRANCH %bb.4
 
   bb.4:
-    S_NOP 0, implicit %50.sub0, implicit %60, implicit %10.sub0, implicit %10.sub2
-    S_NOP 0, implicit %52, implicit %61, implicit %11, implicit %21
-    S_NOP 0, implicit %52, implicit %62, implicit %12, implicit %22
-    S_NOP 0, implicit %53, implicit %63, implicit %13, implicit %23
-    S_NOP 0, implicit %54, implicit %64, implicit %14, implicit %24
+
+    S_NOP 0, implicit %52, implicit %62
+    S_NOP 0, implicit %50, implicit %60, implicit %20
+    S_NOP 0, implicit %51, implicit %61, implicit %21
+    S_NOP 0, implicit %53, implicit %63
+    S_NOP 0, implicit %54, implicit %64
     S_NOP 0, implicit %55, implicit %65
     S_NOP 0, implicit %56, implicit %66
     S_NOP 0, implicit %57, implicit %67
@@ -7283,20 +7348,22 @@ body:             |
     S_NOP 0, implicit %80
     S_ENDPGM 0
 ...
-
-# The remat candidate (instruction defining %3) uses a register (%2) which does not have a subrange for every subregister (sub1 is undefined).
-# Be sure that when checking if we can rematerialize %3, that we handle the liveness checking of %2 properly.
-
 ---
-name:            omitted_subrange
+name:            test_occ_8_exec_use
 tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
 body:             |
-  ; GFX908-LABEL: name: omitted_subrange
+  ; GFX908-LABEL: name: test_occ_8_exec_use
   ; GFX908: bb.0:
   ; GFX908-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX908-NEXT:   liveins: $sgpr3, $sgpr4
+  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
@@ -7314,115 +7381,89 @@ body:             |
   ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
+  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF6]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF6]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   $vgpr8 = IMPLICIT_DEF
+  ; GFX908-NEXT:   $vgpr9 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[DEF32:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; GFX908-NEXT:   undef [[V_RCP_F32_e32_:%[0-9]+]].sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_RCP_F32_e32 [[DEF32]].sub0, implicit $mode, implicit $exec
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   dead [[DEF33:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[DEF34:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 2, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
   ; GFX908-NEXT:   S_BRANCH %bb.1
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
-  ; GFX908-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; GFX908-NEXT:   liveins: $sgpr3, $sgpr4
-  ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   %temp:vgpr_32 = IMPLICIT_DEF
-  ; GFX908-NEXT:   S_CMP_LG_U32 $sgpr3, $sgpr4, implicit-def $scc
-  ; GFX908-NEXT:   [[DEF34:%[0-9]+]].sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, [[DEF32]].sub0, 1, %temp, 0, 0, implicit $mode, implicit $exec
-  ; GFX908-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
-  ; GFX908-NEXT:   S_BRANCH %bb.3
-  ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT: bb.2:
-  ; GFX908-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX908-NEXT:   liveins: $sgpr3, $sgpr4
-  ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   undef [[V_FMA_F32_e64_:%[0-9]+]].sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_FMA_F32_e64 0, [[DEF34]].sub1, 0, [[V_RCP_F32_e32_]].sub0, 0, [[DEF34]].sub0, 0, 0, implicit $mode, implicit $exec
-  ; GFX908-NEXT:   %temp2:vreg_64_align2 = IMPLICIT_DEF
-  ; GFX908-NEXT:   [[V_PK_MUL_F32_:%[0-9]+]]:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_PK_MUL_F32 0, [[V_RCP_F32_e32_]], 8, [[DEF32]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  ; GFX908-NEXT:   dead [[V_PK_FMA_F32_:%[0-9]+]]:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_PK_FMA_F32 0, [[V_FMA_F32_e64_]], 8, %temp2, 11, [[V_PK_MUL_F32_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_28]], implicit [[V_CVT_I32_F32_e32_23]], implicit [[DEF22]], implicit [[DEF27]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_29]], implicit [[V_CVT_I32_F32_e32_24]], implicit [[DEF23]], implicit [[DEF28]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_30]], implicit [[V_CVT_I32_F32_e32_25]], implicit [[DEF24]], implicit [[DEF29]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
-  ; GFX908-NEXT:   dead [[DEF35:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_31]], implicit [[V_CVT_I32_F32_e32_26]], implicit [[DEF25]], implicit [[DEF30]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_22]], implicit [[V_CVT_I32_F32_e32_27]], implicit [[DEF26]], implicit [[DEF31]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_6]], implicit [[DEF32]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_11]], implicit [[V_CVT_I32_F32_e32_12]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_15]], implicit [[V_CVT_I32_F32_e32_16]]
+  ; GFX908-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 255
+  ; GFX908-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 [[S_MOV_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_9]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 1, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_31]], implicit [[V_CVT_I32_F32_e32_29]], implicit [[DEF29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_28]], implicit [[V_CVT_I32_F32_e32_30]], implicit [[DEF30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_10]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_11]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_12]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_13]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_6]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_7]], implicit [[V_CVT_I32_F32_e32_15]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_8]], implicit [[V_CVT_I32_F32_e32_16]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]]
-  ; GFX908-NEXT:   S_BRANCH %bb.3
-  ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT: bb.3:
-  ; GFX908-NEXT:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
-  ; GFX908-NEXT:   liveins: $sgpr3, $sgpr4
-  ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   S_CMP_LG_U32 $sgpr4, 0, implicit-def $scc
-  ; GFX908-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
-  ; GFX908-NEXT:   S_BRANCH %bb.4
-  ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT: bb.4:
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]], implicit [[V_CVT_I32_F32_e32_22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_25]], implicit [[V_CVT_I32_F32_e32_26]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_27]]
+  ; GFX908-NEXT:   $exec = S_MOV_B64 [[S_AND_SAVEEXEC_B64_]]
   ; GFX908-NEXT:   S_ENDPGM 0
   bb.0:
-    liveins: $sgpr3, $sgpr4
-
-    %0:vreg_64 = IMPLICIT_DEF
-    %1:vreg_64_align2 = IMPLICIT_DEF
-    undef %2.sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_RCP_F32_e32 %1.sub0:vreg_64_align2, implicit $mode, implicit $exec
-    %3:vreg_64_align2 =  nnan ninf nsz arcp contract afn reassoc nofpexcept V_PK_MUL_F32 0, %2, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    %5:vreg_64_align2 = IMPLICIT_DEF
-    %10:vgpr_32 = IMPLICIT_DEF
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %2:vgpr_32(s32) = COPY $vgpr0
+    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
+    undef %5.sub1:sreg_64 = S_MOV_B32 0
+    %5.sub0:sreg_64 = COPY %3.sub1
+    $vgpr8 = IMPLICIT_DEF
+    $vgpr9 = IMPLICIT_DEF
     %11:vgpr_32 = IMPLICIT_DEF
     %12:vgpr_32 = IMPLICIT_DEF
     %13:vgpr_32 = IMPLICIT_DEF
@@ -7454,8 +7495,8 @@ body:             |
     %39:vgpr_32 = IMPLICIT_DEF
     %40:vgpr_32 = IMPLICIT_DEF
     %41:vgpr_32 = IMPLICIT_DEF
-    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10, implicit $exec, implicit $mode
-    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
+    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 1, implicit $exec, implicit $mode
+    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 2, implicit $exec, implicit $mode
     %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
     %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
     %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
@@ -7486,52 +7527,29 @@ body:             |
     %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
     %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
     %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
-    S_BRANCH %bb.1
-
-  bb.1:
-    liveins: $sgpr3, $sgpr4
-
-    %temp:vgpr_32 = IMPLICIT_DEF
-    %5.sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, %1.sub0:vreg_64_align2, 1, %temp, 0, 0, implicit $mode, implicit $exec
-
-    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
-    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
     S_BRANCH %bb.4
 
-  bb.3:
-    liveins: $sgpr3, $sgpr4
-
-    %6:vreg_64_align2 = IMPLICIT_DEF
-    undef %7.sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_FMA_F32_e64 0, %5.sub1, 0, %2.sub0:vreg_64_align2, 0, %5.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
-    %temp2:vreg_64_align2 = IMPLICIT_DEF
-    %8:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_PK_FMA_F32 0, %7:vreg_64_align2, 8, %temp2, 11, %3:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    S_NOP 0, implicit %50, implicit %60, implicit %10, implicit %20
-    S_NOP 0, implicit %51, implicit %61, implicit %11, implicit %21
-    S_NOP 0, implicit %52, implicit %62, implicit %12, implicit %22
-    S_NOP 0, implicit %53, implicit %63, implicit %13, implicit %23
-    S_NOP 0, implicit %54, implicit %64, implicit %14, implicit %24
-    S_NOP 0, implicit %55, implicit %65, implicit %1
+  bb.4:
+
+    %100:sreg_64 = S_MOV_B64 255
+    %101:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed %100, implicit-def $exec, implicit-def $scc, implicit $exec
+    S_NOP 0, implicit %52, implicit %62
+    S_NOP 0, implicit %50, implicit %60, implicit %20
+    S_NOP 0, implicit %51, implicit %61, implicit %21
+    S_NOP 0, implicit %53, implicit %63
+    S_NOP 0, implicit %54, implicit %64
+    S_NOP 0, implicit %55, implicit %65
     S_NOP 0, implicit %56, implicit %66
     S_NOP 0, implicit %57, implicit %67
     S_NOP 0, implicit %58, implicit %68
     S_NOP 0, implicit %59, implicit %69
     S_NOP 0, implicit %70, implicit %71
     S_NOP 0, implicit %72, implicit %73
-    S_NOP 0, implicit %72, implicit %73
     S_NOP 0, implicit %74, implicit %75
     S_NOP 0, implicit %76, implicit %77
     S_NOP 0, implicit %78, implicit %79
     S_NOP 0, implicit %80
-    S_BRANCH %bb.4
-
-  bb.4:
-    liveins: $sgpr3, $sgpr4
-
-    S_CMP_LG_U32 $sgpr4, 0, implicit-def $scc
-    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
-    S_BRANCH %bb.2
-
-  bb.2:
+    $exec = S_MOV_B64 %101:sreg_64_xexec
     S_ENDPGM 0
-
 ...
+

>From 51683490d2e4c6661cca52fc83c64778e702f49d Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Wed, 5 Feb 2025 18:13:25 +0000
Subject: [PATCH 2/8] Fix spurious auto-formats and typo

The typo in GCNSubtarget::getMaxNumArchVGPRs made the function return
the wrong value in all cases. The returned value is now correct; this
causes some unit tests to break in largely cosmetic ways, since that
value is used to calculate excess RP in the remat stage.
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  11 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         | 576 +++++++++++++-----
 .../atomic_optimizations_global_pointer.ll    |   4 +-
 .../atomic_optimizations_local_pointer.ll     | 141 ++---
 .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll |  41 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        |  50 +-
 .../AMDGPU/global_atomics_scan_fmax.ll        |  22 +-
 .../AMDGPU/global_atomics_scan_fmin.ll        |  22 +-
 .../AMDGPU/global_atomics_scan_fsub.ll        |  70 +--
 llvm/test/CodeGen/AMDGPU/idiv-licm.ll         |  14 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll |   4 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll |   4 +-
 .../llvm.amdgcn.struct.atomic.buffer.load.ll  |   7 +-
 ...vm.amdgcn.struct.ptr.atomic.buffer.load.ll |   7 +-
 .../test/CodeGen/AMDGPU/loop-prefetch-data.ll |   8 +-
 .../CodeGen/AMDGPU/no-dup-inst-prefetch.ll    |   2 +-
 .../AMDGPU/pal-metadata-3.0-callable.ll       |   2 +-
 17 files changed, 636 insertions(+), 349 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 01cbd0aefba39..fbf1f9dae03a5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -329,7 +329,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                          const RegPressureTracker &RPTracker,
                                          SchedCandidate &Cand,
                                          bool IsBottomUp) {
-  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
   ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
   unsigned SGPRPressure = 0;
   unsigned VGPRPressure = 0;
@@ -1403,7 +1403,8 @@ GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
 #ifndef NDEBUG
   LLVM_DEBUG(
       printScheduleModel(ReadyCyclesSorted);
-      dbgs() << "\n\t" << "Metric: "
+      dbgs() << "\n\t"
+             << "Metric: "
              << (SumBubbles
                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                      : 1)
@@ -1438,7 +1439,8 @@ GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
 #ifndef NDEBUG
   LLVM_DEBUG(
       printScheduleModel(ReadyCyclesSorted);
-      dbgs() << "\n\t" << "Metric: "
+      dbgs() << "\n\t"
+             << "Metric: "
              << (SumBubbles
                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                      : 1)
@@ -1486,7 +1488,8 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
       dbgs()
       << "\n\t      *** In shouldRevertScheduling ***\n"
       << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
-  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
+  ScheduleMetrics MBefore =
+      getScheduleMetrics(DAG.SUnits);
   LLVM_DEBUG(
       dbgs()
       << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 29acb91afb82c..8345a498045f5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -38,12 +38,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // Following 2 enums are documented at:
   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
   enum class TrapHandlerAbi {
-    NONE = 0x00,
+    NONE   = 0x00,
     AMDHSA = 0x01,
   };
 
   enum class TrapID {
-    LLVMAMDHSATrap = 0x02,
+    LLVMAMDHSATrap      = 0x02,
     LLVMAMDHSADebugTrap = 0x03,
   };
 
@@ -269,20 +269,24 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
                const GCNTargetMachine &TM);
   ~GCNSubtarget() override;
 
-  GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU,
-                                                StringRef FS);
+  GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
+                                                   StringRef GPU, StringRef FS);
 
   /// Diagnose inconsistent subtarget features before attempting to codegen
   /// function \p F.
   void checkSubtargetFeatures(const Function &F) const;
 
-  const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const SIInstrInfo *getInstrInfo() const override {
+    return &InstrInfo;
+  }
 
   const SIFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
 
-  const SITargetLowering *getTargetLowering() const override { return &TLInfo; }
+  const SITargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
 
   const SIRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
@@ -320,7 +324,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
 
-  Generation getGeneration() const { return (Generation)Gen; }
+  Generation getGeneration() const {
+    return (Generation)Gen;
+  }
 
   unsigned getMaxWaveScratchSize() const {
     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
@@ -341,7 +347,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
   }
 
-  int getLDSBankCount() const { return LDSBankCount; }
+  int getLDSBankCount() const {
+    return LDSBankCount;
+  }
 
   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
@@ -356,17 +364,29 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool supportsWGP() const { return getGeneration() >= GFX10; }
 
-  bool hasIntClamp() const { return HasIntClamp; }
+  bool hasIntClamp() const {
+    return HasIntClamp;
+  }
 
-  bool hasFP64() const { return FP64; }
+  bool hasFP64() const {
+    return FP64;
+  }
 
-  bool hasMIMG_R128() const { return MIMG_R128; }
+  bool hasMIMG_R128() const {
+    return MIMG_R128;
+  }
 
-  bool hasHWFP64() const { return FP64; }
+  bool hasHWFP64() const {
+    return FP64;
+  }
 
-  bool hasHalfRate64Ops() const { return HalfRate64Ops; }
+  bool hasHalfRate64Ops() const {
+    return HalfRate64Ops;
+  }
 
-  bool hasFullRate64Ops() const { return FullRate64Ops; }
+  bool hasFullRate64Ops() const {
+    return FullRate64Ops;
+  }
 
   bool hasAddr64() const {
     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
@@ -382,37 +402,65 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() >= VOLCANIC_ISLANDS;
   }
 
-  bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
+  bool hasFractBug() const {
+    return getGeneration() == SOUTHERN_ISLANDS;
+  }
 
-  bool hasBFE() const { return true; }
+  bool hasBFE() const {
+    return true;
+  }
 
-  bool hasBFI() const { return true; }
+  bool hasBFI() const {
+    return true;
+  }
 
-  bool hasBFM() const { return hasBFE(); }
+  bool hasBFM() const {
+    return hasBFE();
+  }
 
-  bool hasBCNT(unsigned Size) const { return true; }
+  bool hasBCNT(unsigned Size) const {
+    return true;
+  }
 
-  bool hasFFBL() const { return true; }
+  bool hasFFBL() const {
+    return true;
+  }
 
-  bool hasFFBH() const { return true; }
+  bool hasFFBH() const {
+    return true;
+  }
 
-  bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
+  bool hasMed3_16() const {
+    return getGeneration() >= AMDGPUSubtarget::GFX9;
+  }
 
   bool hasMin3Max3_16() const {
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }
 
-  bool hasFmaMixInsts() const { return HasFmaMixInsts; }
+  bool hasFmaMixInsts() const {
+    return HasFmaMixInsts;
+  }
 
-  bool hasCARRY() const { return true; }
+  bool hasCARRY() const {
+    return true;
+  }
 
-  bool hasFMA() const { return FMA; }
+  bool hasFMA() const {
+    return FMA;
+  }
 
-  bool hasSwap() const { return GFX9Insts; }
+  bool hasSwap() const {
+    return GFX9Insts;
+  }
 
-  bool hasScalarPackInsts() const { return GFX9Insts; }
+  bool hasScalarPackInsts() const {
+    return GFX9Insts;
+  }
 
-  bool hasScalarMulHiInsts() const { return GFX9Insts; }
+  bool hasScalarMulHiInsts() const {
+    return GFX9Insts;
+  }
 
   bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
 
@@ -427,7 +475,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   /// True if the offset field of DS instructions works as expected. On SI, the
   /// offset uses a 16-bit adder and does not always wrap properly.
-  bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
+  bool hasUsableDSOffset() const {
+    return getGeneration() >= SEA_ISLANDS;
+  }
 
   bool unsafeDSOffsetFoldingEnabled() const {
     return EnableUnsafeDSOffsetFolding;
@@ -440,10 +490,14 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   /// Extra wait hazard is needed in some cases before
   /// s_cbranch_vccnz/s_cbranch_vccz.
-  bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
+  bool hasReadVCCZBug() const {
+    return getGeneration() <= SEA_ISLANDS;
+  }
 
   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
-  bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
+  bool partialVCCWritesUpdateVCCZ() const {
+    return getGeneration() >= GFX10;
+  }
 
   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
   /// was written by a VALU instruction.
@@ -457,14 +511,18 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() >= VOLCANIC_ISLANDS;
   }
 
-  bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
+  bool hasRFEHazards() const {
+    return getGeneration() >= VOLCANIC_ISLANDS;
+  }
 
   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
   unsigned getSetRegWaitStates() const {
     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
   }
 
-  bool dumpCode() const { return DumpCode; }
+  bool dumpCode() const {
+    return DumpCode;
+  }
 
   /// Return the amount of LDS that can be used that will not restrict the
   /// occupancy lower than WaveCount.
@@ -480,17 +538,25 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() >= AMDGPUSubtarget::GFX10;
   }
 
-  bool useFlatForGlobal() const { return FlatForGlobal; }
+  bool useFlatForGlobal() const {
+    return FlatForGlobal;
+  }
 
   /// \returns If target supports ds_read/write_b128 and user enables generation
   /// of ds_read/write_b128.
-  bool useDS128() const { return CIInsts && EnableDS128; }
+  bool useDS128() const {
+    return CIInsts && EnableDS128;
+  }
 
   /// \return If target supports ds_read/write_b96/128.
-  bool hasDS96AndDS128() const { return CIInsts; }
+  bool hasDS96AndDS128() const {
+    return CIInsts;
+  }
 
   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
-  bool haveRoundOpsF64() const { return CIInsts; }
+  bool haveRoundOpsF64() const {
+    return CIInsts;
+  }
 
   /// \returns If MUBUF instructions always perform range checking, even for
   /// buffer resources used for private memory access.
@@ -500,55 +566,89 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   /// \returns If target requires PRT Struct NULL support (zero result registers
   /// for sparse texture support).
-  bool usePRTStrictNull() const { return EnablePRTStrictNull; }
+  bool usePRTStrictNull() const {
+    return EnablePRTStrictNull;
+  }
 
-  bool hasAutoWaitcntBeforeBarrier() const { return AutoWaitcntBeforeBarrier; }
+  bool hasAutoWaitcntBeforeBarrier() const {
+    return AutoWaitcntBeforeBarrier;
+  }
 
   /// \returns true if the target supports backing off of s_barrier instructions
   /// when an exception is raised.
-  bool supportsBackOffBarrier() const { return BackOffBarrier; }
+  bool supportsBackOffBarrier() const {
+    return BackOffBarrier;
+  }
 
-  bool hasUnalignedBufferAccess() const { return UnalignedBufferAccess; }
+  bool hasUnalignedBufferAccess() const {
+    return UnalignedBufferAccess;
+  }
 
   bool hasUnalignedBufferAccessEnabled() const {
     return UnalignedBufferAccess && UnalignedAccessMode;
   }
 
-  bool hasUnalignedDSAccess() const { return UnalignedDSAccess; }
+  bool hasUnalignedDSAccess() const {
+    return UnalignedDSAccess;
+  }
 
   bool hasUnalignedDSAccessEnabled() const {
     return UnalignedDSAccess && UnalignedAccessMode;
   }
 
-  bool hasUnalignedScratchAccess() const { return UnalignedScratchAccess; }
+  bool hasUnalignedScratchAccess() const {
+    return UnalignedScratchAccess;
+  }
 
   bool hasUnalignedScratchAccessEnabled() const {
     return UnalignedScratchAccess && UnalignedAccessMode;
   }
 
-  bool hasUnalignedAccessMode() const { return UnalignedAccessMode; }
+  bool hasUnalignedAccessMode() const {
+    return UnalignedAccessMode;
+  }
 
-  bool hasApertureRegs() const { return HasApertureRegs; }
+  bool hasApertureRegs() const {
+    return HasApertureRegs;
+  }
 
-  bool isTrapHandlerEnabled() const { return TrapHandler; }
+  bool isTrapHandlerEnabled() const {
+    return TrapHandler;
+  }
 
-  bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
+  bool isXNACKEnabled() const {
+    return TargetID.isXnackOnOrAny();
+  }
 
-  bool isTgSplitEnabled() const { return EnableTgSplit; }
+  bool isTgSplitEnabled() const {
+    return EnableTgSplit;
+  }
 
-  bool isCuModeEnabled() const { return EnableCuMode; }
+  bool isCuModeEnabled() const {
+    return EnableCuMode;
+  }
 
   bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
 
-  bool hasFlatAddressSpace() const { return FlatAddressSpace; }
+  bool hasFlatAddressSpace() const {
+    return FlatAddressSpace;
+  }
 
-  bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
+  bool hasFlatScrRegister() const {
+    return hasFlatAddressSpace();
+  }
 
-  bool hasFlatInstOffsets() const { return FlatInstOffsets; }
+  bool hasFlatInstOffsets() const {
+    return FlatInstOffsets;
+  }
 
-  bool hasFlatGlobalInsts() const { return FlatGlobalInsts; }
+  bool hasFlatGlobalInsts() const {
+    return FlatGlobalInsts;
+  }
 
-  bool hasFlatScratchInsts() const { return FlatScratchInsts; }
+  bool hasFlatScratchInsts() const {
+    return FlatScratchInsts;
+  }
 
   // Check if target supports ST addressing mode with FLAT scratch instructions.
   // The ST addressing mode means no registers are used, either VGPR or SGPR,
@@ -559,20 +659,30 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
 
-  bool hasScalarFlatScratchInsts() const { return ScalarFlatScratchInsts; }
+  bool hasScalarFlatScratchInsts() const {
+    return ScalarFlatScratchInsts;
+  }
 
   bool enableFlatScratch() const {
     return flatScratchIsArchitected() ||
            (EnableFlatScratch && hasFlatScratchInsts());
   }
 
-  bool hasGlobalAddTidInsts() const { return GFX10_BEncoding; }
+  bool hasGlobalAddTidInsts() const {
+    return GFX10_BEncoding;
+  }
 
-  bool hasAtomicCSub() const { return GFX10_BEncoding; }
+  bool hasAtomicCSub() const {
+    return GFX10_BEncoding;
+  }
 
-  bool hasExportInsts() const { return !hasGFX940Insts(); }
+  bool hasExportInsts() const {
+    return !hasGFX940Insts();
+  }
 
-  bool hasVINTERPEncoding() const { return GFX11Insts; }
+  bool hasVINTERPEncoding() const {
+    return GFX11Insts;
+  }
 
   // DS_ADD_F64/DS_ADD_RTN_F64
   bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
@@ -581,98 +691,162 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() >= GFX9;
   }
 
-  bool hasFlatSegmentOffsetBug() const { return HasFlatSegmentOffsetBug; }
+  bool hasFlatSegmentOffsetBug() const {
+    return HasFlatSegmentOffsetBug;
+  }
 
-  bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
+  bool hasFlatLgkmVMemCountInOrder() const {
+    return getGeneration() > GFX9;
+  }
 
-  bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
+  bool hasD16LoadStore() const {
+    return getGeneration() >= GFX9;
+  }
 
   bool d16PreservesUnusedBits() const {
     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
   }
 
-  bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
+  bool hasD16Images() const {
+    return getGeneration() >= VOLCANIC_ISLANDS;
+  }
 
   /// Return if most LDS instructions have an m0 use that require m0 to be
   /// initialized.
-  bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
+  bool ldsRequiresM0Init() const {
+    return getGeneration() < GFX9;
+  }
 
   // True if the hardware rewinds and replays GWS operations if a wave is
   // preempted.
   //
   // If this is false, a GWS operation requires testing if a nack set the
   // MEM_VIOL bit, and repeating if so.
-  bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
+  bool hasGWSAutoReplay() const {
+    return getGeneration() >= GFX9;
+  }
 
   /// \returns if target has ds_gws_sema_release_all instruction.
-  bool hasGWSSemaReleaseAll() const { return CIInsts; }
+  bool hasGWSSemaReleaseAll() const {
+    return CIInsts;
+  }
 
   /// \returns true if the target has integer add/sub instructions that do not
   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
   /// for saturation.
-  bool hasAddNoCarry() const { return AddNoCarryInsts; }
+  bool hasAddNoCarry() const {
+    return AddNoCarryInsts;
+  }
 
   bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
 
   bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
 
-  bool hasUnpackedD16VMem() const { return HasUnpackedD16VMem; }
+  bool hasUnpackedD16VMem() const {
+    return HasUnpackedD16VMem;
+  }
 
   // Covers VS/PS/CS graphics shaders
   bool isMesaGfxShader(const Function &F) const {
     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
   }
 
-  bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
+  bool hasMad64_32() const {
+    return getGeneration() >= SEA_ISLANDS;
+  }
 
-  bool hasSDWAOmod() const { return HasSDWAOmod; }
+  bool hasSDWAOmod() const {
+    return HasSDWAOmod;
+  }
 
-  bool hasSDWAScalar() const { return HasSDWAScalar; }
+  bool hasSDWAScalar() const {
+    return HasSDWAScalar;
+  }
 
-  bool hasSDWASdst() const { return HasSDWASdst; }
+  bool hasSDWASdst() const {
+    return HasSDWASdst;
+  }
 
-  bool hasSDWAMac() const { return HasSDWAMac; }
+  bool hasSDWAMac() const {
+    return HasSDWAMac;
+  }
 
-  bool hasSDWAOutModsVOPC() const { return HasSDWAOutModsVOPC; }
+  bool hasSDWAOutModsVOPC() const {
+    return HasSDWAOutModsVOPC;
+  }
 
-  bool hasDLInsts() const { return HasDLInsts; }
+  bool hasDLInsts() const {
+    return HasDLInsts;
+  }
 
   bool hasFmacF64Inst() const { return HasFmacF64Inst; }
 
-  bool hasDot1Insts() const { return HasDot1Insts; }
+  bool hasDot1Insts() const {
+    return HasDot1Insts;
+  }
 
-  bool hasDot2Insts() const { return HasDot2Insts; }
+  bool hasDot2Insts() const {
+    return HasDot2Insts;
+  }
 
-  bool hasDot3Insts() const { return HasDot3Insts; }
+  bool hasDot3Insts() const {
+    return HasDot3Insts;
+  }
 
-  bool hasDot4Insts() const { return HasDot4Insts; }
+  bool hasDot4Insts() const {
+    return HasDot4Insts;
+  }
 
-  bool hasDot5Insts() const { return HasDot5Insts; }
+  bool hasDot5Insts() const {
+    return HasDot5Insts;
+  }
 
-  bool hasDot6Insts() const { return HasDot6Insts; }
+  bool hasDot6Insts() const {
+    return HasDot6Insts;
+  }
 
-  bool hasDot7Insts() const { return HasDot7Insts; }
+  bool hasDot7Insts() const {
+    return HasDot7Insts;
+  }
 
-  bool hasDot8Insts() const { return HasDot8Insts; }
+  bool hasDot8Insts() const {
+    return HasDot8Insts;
+  }
 
-  bool hasDot9Insts() const { return HasDot9Insts; }
+  bool hasDot9Insts() const {
+    return HasDot9Insts;
+  }
 
-  bool hasDot10Insts() const { return HasDot10Insts; }
+  bool hasDot10Insts() const {
+    return HasDot10Insts;
+  }
 
-  bool hasDot11Insts() const { return HasDot11Insts; }
+  bool hasDot11Insts() const {
+    return HasDot11Insts;
+  }
 
-  bool hasDot12Insts() const { return HasDot12Insts; }
+  bool hasDot12Insts() const {
+    return HasDot12Insts;
+  }
 
-  bool hasDot13Insts() const { return HasDot13Insts; }
+  bool hasDot13Insts() const {
+    return HasDot13Insts;
+  }
 
-  bool hasMAIInsts() const { return HasMAIInsts; }
+  bool hasMAIInsts() const {
+    return HasMAIInsts;
+  }
 
-  bool hasFP8Insts() const { return HasFP8Insts; }
+  bool hasFP8Insts() const {
+    return HasFP8Insts;
+  }
 
   bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
 
-  bool hasPkFmacF16Inst() const { return HasPkFmacF16Inst; }
+  bool hasPkFmacF16Inst() const {
+    return HasPkFmacF16Inst;
+  }
 
   bool hasAtomicFMinFMaxF32GlobalInsts() const {
     return HasAtomicFMinFMaxF32GlobalInsts;
@@ -745,23 +919,37 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return HasDefaultComponentBroadcast;
   }
 
-  bool hasNoSdstCMPX() const { return HasNoSdstCMPX; }
+  bool hasNoSdstCMPX() const {
+    return HasNoSdstCMPX;
+  }
 
-  bool hasVscnt() const { return HasVscnt; }
+  bool hasVscnt() const {
+    return HasVscnt;
+  }
 
-  bool hasGetWaveIdInst() const { return HasGetWaveIdInst; }
+  bool hasGetWaveIdInst() const {
+    return HasGetWaveIdInst;
+  }
 
-  bool hasSMemTimeInst() const { return HasSMemTimeInst; }
+  bool hasSMemTimeInst() const {
+    return HasSMemTimeInst;
+  }
 
-  bool hasShaderCyclesRegister() const { return HasShaderCyclesRegister; }
+  bool hasShaderCyclesRegister() const {
+    return HasShaderCyclesRegister;
+  }
 
   bool hasShaderCyclesHiLoRegisters() const {
     return HasShaderCyclesHiLoRegisters;
   }
 
-  bool hasVOP3Literal() const { return HasVOP3Literal; }
+  bool hasVOP3Literal() const {
+    return HasVOP3Literal;
+  }
 
-  bool hasNoDataDepHazard() const { return HasNoDataDepHazard; }
+  bool hasNoDataDepHazard() const {
+    return HasNoDataDepHazard;
+  }
 
   bool vmemWriteNeedsExpWaitcnt() const {
     return getGeneration() < SEA_ISLANDS;
@@ -786,11 +974,15 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // dynamic realignment in common cases.
   Align getStackAlignment() const { return Align(16); }
 
-  bool enableMachineScheduler() const override { return true; }
+  bool enableMachineScheduler() const override {
+    return true;
+  }
 
   bool useAA() const override;
 
-  bool enableSubRegLiveness() const override { return true; }
+  bool enableSubRegLiveness() const override {
+    return true;
+  }
 
   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
@@ -799,7 +991,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
 
   // XXX - Why is this here if it isn't in the default pass set?
-  bool enableEarlyIfConversion() const override { return true; }
+  bool enableEarlyIfConversion() const override {
+    return true;
+  }
 
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            unsigned NumRegionInstrs) const override;
@@ -810,11 +1004,17 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AMDGPU::getMaxNumUserSGPRs(*this);
   }
 
-  bool hasSMemRealTime() const { return HasSMemRealTime; }
+  bool hasSMemRealTime() const {
+    return HasSMemRealTime;
+  }
 
-  bool hasMovrel() const { return HasMovrel; }
+  bool hasMovrel() const {
+    return HasMovrel;
+  }
 
-  bool hasVGPRIndexMode() const { return HasVGPRIndexMode; }
+  bool hasVGPRIndexMode() const {
+    return HasVGPRIndexMode;
+  }
 
   bool useVGPRIndexMode() const;
 
@@ -824,9 +1024,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
 
-  bool hasScalarStores() const { return HasScalarStores; }
+  bool hasScalarStores() const {
+    return HasScalarStores;
+  }
 
-  bool hasScalarAtomics() const { return HasScalarAtomics; }
+  bool hasScalarAtomics() const {
+    return HasScalarAtomics;
+  }
 
   bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
   bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
@@ -837,40 +1041,60 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
   bool hasPermLane64() const { return getGeneration() >= GFX11; }
 
-  bool hasDPP() const { return HasDPP; }
+  bool hasDPP() const {
+    return HasDPP;
+  }
 
-  bool hasDPPBroadcasts() const { return HasDPP && getGeneration() < GFX10; }
+  bool hasDPPBroadcasts() const {
+    return HasDPP && getGeneration() < GFX10;
+  }
 
   bool hasDPPWavefrontShifts() const {
     return HasDPP && getGeneration() < GFX10;
   }
 
-  bool hasDPP8() const { return HasDPP8; }
+  bool hasDPP8() const {
+    return HasDPP8;
+  }
 
-  bool hasDPALU_DPP() const { return HasDPALU_DPP; }
+  bool hasDPALU_DPP() const {
+    return HasDPALU_DPP;
+  }
 
   bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
 
-  bool hasPackedFP32Ops() const { return HasPackedFP32Ops; }
+  bool hasPackedFP32Ops() const {
+    return HasPackedFP32Ops;
+  }
 
   // Has V_PK_MOV_B32 opcode
-  bool hasPkMovB32() const { return GFX90AInsts; }
+  bool hasPkMovB32() const {
+    return GFX90AInsts;
+  }
 
   bool hasFmaakFmamkF32Insts() const {
     return getGeneration() >= GFX10 || hasGFX940Insts();
   }
 
-  bool hasImageInsts() const { return HasImageInsts; }
+  bool hasImageInsts() const {
+    return HasImageInsts;
+  }
 
-  bool hasExtendedImageInsts() const { return HasExtendedImageInsts; }
+  bool hasExtendedImageInsts() const {
+    return HasExtendedImageInsts;
+  }
 
-  bool hasR128A16() const { return HasR128A16; }
+  bool hasR128A16() const {
+    return HasR128A16;
+  }
 
   bool hasA16() const { return HasA16; }
 
   bool hasG16() const { return HasG16; }
 
-  bool hasOffset3fBug() const { return HasOffset3fBug; }
+  bool hasOffset3fBug() const {
+    return HasOffset3fBug;
+  }
 
   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
 
@@ -892,11 +1116,17 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AMDGPU::getNSAMaxSize(*this, HasSampler);
   }
 
-  bool hasGFX10_AEncoding() const { return GFX10_AEncoding; }
+  bool hasGFX10_AEncoding() const {
+    return GFX10_AEncoding;
+  }
 
-  bool hasGFX10_BEncoding() const { return GFX10_BEncoding; }
+  bool hasGFX10_BEncoding() const {
+    return GFX10_BEncoding;
+  }
 
-  bool hasGFX10_3Insts() const { return GFX10_3Insts; }
+  bool hasGFX10_3Insts() const {
+    return GFX10_3Insts;
+  }
 
   bool hasMadF16() const;
 
@@ -904,13 +1134,21 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasLshlAddB64() const { return GFX940Insts; }
 
-  bool enableSIScheduler() const { return EnableSIScheduler; }
+  bool enableSIScheduler() const {
+    return EnableSIScheduler;
+  }
 
-  bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }
+  bool loadStoreOptEnabled() const {
+    return EnableLoadStoreOpt;
+  }
 
-  bool hasSGPRInitBug() const { return SGPRInitBug; }
+  bool hasSGPRInitBug() const {
+    return SGPRInitBug;
+  }
 
-  bool hasUserSGPRInit16Bug() const { return UserSGPRInit16Bug && isWave32(); }
+  bool hasUserSGPRInit16Bug() const {
+    return UserSGPRInit16Bug && isWave32();
+  }
 
   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
 
@@ -918,14 +1156,18 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return NegativeUnalignedScratchOffsetBug;
   }
 
-  bool hasMFMAInlineLiteralBug() const { return HasMFMAInlineLiteralBug; }
+  bool hasMFMAInlineLiteralBug() const {
+    return HasMFMAInlineLiteralBug;
+  }
 
   bool has12DWordStoreHazard() const {
     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
   }
 
   // \returns true if the subtarget supports DWORDX3 load/store instructions.
-  bool hasDwordx3LoadStores() const { return CIInsts; }
+  bool hasDwordx3LoadStores() const {
+    return CIInsts;
+  }
 
   bool hasReadM0MovRelInterpHazard() const {
     return getGeneration() == AMDGPUSubtarget::GFX9;
@@ -944,23 +1186,39 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() == AMDGPUSubtarget::GFX9;
   }
 
-  bool hasVcmpxPermlaneHazard() const { return HasVcmpxPermlaneHazard; }
+  bool hasVcmpxPermlaneHazard() const {
+    return HasVcmpxPermlaneHazard;
+  }
 
-  bool hasVMEMtoScalarWriteHazard() const { return HasVMEMtoScalarWriteHazard; }
+  bool hasVMEMtoScalarWriteHazard() const {
+    return HasVMEMtoScalarWriteHazard;
+  }
 
-  bool hasSMEMtoVectorWriteHazard() const { return HasSMEMtoVectorWriteHazard; }
+  bool hasSMEMtoVectorWriteHazard() const {
+    return HasSMEMtoVectorWriteHazard;
+  }
 
-  bool hasLDSMisalignedBug() const { return LDSMisalignedBug && !EnableCuMode; }
+  bool hasLDSMisalignedBug() const {
+    return LDSMisalignedBug && !EnableCuMode;
+  }
 
-  bool hasInstFwdPrefetchBug() const { return HasInstFwdPrefetchBug; }
+  bool hasInstFwdPrefetchBug() const {
+    return HasInstFwdPrefetchBug;
+  }
 
-  bool hasVcmpxExecWARHazard() const { return HasVcmpxExecWARHazard; }
+  bool hasVcmpxExecWARHazard() const {
+    return HasVcmpxExecWARHazard;
+  }
 
-  bool hasLdsBranchVmemWARHazard() const { return HasLdsBranchVmemWARHazard; }
+  bool hasLdsBranchVmemWARHazard() const {
+    return HasLdsBranchVmemWARHazard;
+  }
 
   // Shift amount of a 64 bit shift cannot be a highest allocated register
   // if also at the end of the allocation block.
-  bool hasShift64HighRegBug() const { return GFX90AInsts && !GFX940Insts; }
+  bool hasShift64HighRegBug() const {
+    return GFX90AInsts && !GFX940Insts;
+  }
 
   // Has one cycle hazard on transcendental instruction feeding a
   // non transcendental VALU.
@@ -974,9 +1232,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
 
   // Does not have HW interlocs for VALU writing and then reading SGPRs.
-  bool hasVDecCoExecHazard() const { return GFX940Insts; }
+  bool hasVDecCoExecHazard() const {
+    return GFX940Insts;
+  }
 
-  bool hasNSAtoVMEMBug() const { return HasNSAtoVMEMBug; }
+  bool hasNSAtoVMEMBug() const {
+    return HasNSAtoVMEMBug;
+  }
 
   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
 
@@ -1046,7 +1308,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// Returns true if the target supports
   /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
   /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
-  bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
+  bool hasLDSLoadB96_B128() const {
+    return hasGFX950Insts();
+  }
 
   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
 
@@ -1077,11 +1341,17 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool hasPermlane32Swap() const { return HasPermlane32Swap; }
   bool hasAshrPkInsts() const { return HasAshrPkInsts; }
 
-  bool hasMinimum3Maximum3F32() const { return HasMinimum3Maximum3F32; }
+  bool hasMinimum3Maximum3F32() const {
+    return HasMinimum3Maximum3F32;
+  }
 
-  bool hasMinimum3Maximum3F16() const { return HasMinimum3Maximum3F16; }
+  bool hasMinimum3Maximum3F16() const {
+    return HasMinimum3Maximum3F16;
+  }
 
-  bool hasMinimum3Maximum3PKF16() const { return HasMinimum3Maximum3PKF16; }
+  bool hasMinimum3Maximum3PKF16() const {
+    return HasMinimum3Maximum3PKF16;
+  }
 
   /// \returns The maximum number of instructions that can be enclosed in an
   /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
@@ -1098,9 +1368,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// VGPRs
   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
 
-  /// Return occupancy for the given function. Used LDS and a number of
-  /// registers if provided.
-  /// Note, occupancy can be affected by the scratch allocation as well, but
+  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
+  /// be achieved when the only function running on a CU is \p F, each workgroup
+  /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
+  /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
+  /// range, so this returns a range as well.
+  ///
+  /// Note that occupancy can be affected by the scratch allocation as well, but
   /// we do not have enough information to compute it.
   std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
                                                  unsigned LDSSize = 0,
@@ -1128,7 +1402,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   /// \returns true if the machine has merged shaders in which s0-s7 are
   /// reserved by the hardware and user SGPRs start at s8
-  bool hasMergedShaders() const { return getGeneration() >= GFX9; }
+  bool hasMergedShaders() const {
+    return getGeneration() >= GFX9;
+  }
 
   // \returns true if the target supports the pre-NGG legacy geometry path.
   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
@@ -1286,7 +1562,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns the maximum number of ArchVGPRs that can be used and still
   /// achieve at least the specified number of waves \p WavesPerEU.
   unsigned getMaxNumArchVGPRs(unsigned WavesPerEU) const {
-    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
+    return AMDGPU::IsaInfo::getMaxNumArchVGPRs(this, WavesPerEU);
   }
 
   /// \returns max num VGPRs. This is the common utility function called by
@@ -1311,7 +1587,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// function \p F.
   unsigned getMaxNumArchVGPRs(const Function &F) const;
 
-  unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
+  unsigned getMaxNumAGPRs(const Function &F) const {
+    return getMaxNumVGPRs(F);
+  }
 
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p MF, or number of VGPRs explicitly
@@ -1323,9 +1601,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// unit requirement.
   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
 
-  bool isWave32() const { return getWavefrontSize() == 32; }
+  bool isWave32() const {
+    return getWavefrontSize() == 32;
+  }
 
-  bool isWave64() const { return getWavefrontSize() == 64; }
+  bool isWave64() const {
+    return getWavefrontSize() == 64;
+  }
 
   /// Returns if the wavesize of this subtarget is known reliable. This is false
   /// only for the a default target-cpu that does not have an explicit
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 8bb8ecb079a34..169cc0c51f4a4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2745,8 +2745,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
 ; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
@@ -6333,8 +6333,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
 ; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 3c0646c46efd0..2edfe67fcd56f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -1026,23 +1026,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
+; GFX1132_DPP-NEXT:    s_mov_b32 s1, s2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    ds_add_rtn_u32 v0, v4, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB2_2:
-; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
@@ -2806,19 +2807,19 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v7, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_add_rtn_u64 v[8:9], v0, v[8:9]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -4448,23 +4449,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
+; GFX1132_DPP-NEXT:    s_mov_b32 s1, s2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    ds_sub_rtn_u32 v0, v4, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB10_2:
-; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
@@ -6251,19 +6253,19 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v7, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_sub_rtn_u64 v[8:9], v0, v[8:9]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -7195,8 +7197,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
@@ -7616,19 +7618,19 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB16_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -8230,23 +8232,24 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
+; GFX1132_DPP-NEXT:    s_mov_b32 s1, s2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB17_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    ds_or_rtn_b32 v0, v4, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB17_2:
-; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
@@ -8559,8 +8562,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
@@ -8980,19 +8983,19 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -9594,23 +9597,24 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
+; GFX1132_DPP-NEXT:    s_mov_b32 s1, s2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB19_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    ds_xor_rtn_b32 v0, v4, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB19_2:
-; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
@@ -9923,8 +9927,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
@@ -10344,19 +10348,19 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -11569,8 +11573,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    v_cmp_gt_i64_e64 s8, s[0:1], s[6:7]
@@ -12177,19 +12181,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_max_rtn_i64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -13403,8 +13407,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    v_cmp_lt_i64_e64 s8, s[0:1], s[6:7]
@@ -14011,19 +14015,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_min_rtn_i64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -14626,23 +14630,24 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
+; GFX1132_DPP-NEXT:    s_mov_b32 s1, s2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB27_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    ds_max_rtn_u32 v0, v4, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB27_2:
-; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
@@ -15226,8 +15231,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -15830,19 +15835,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_max_rtn_u64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -17046,8 +17051,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -17648,19 +17653,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB32_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_min_rtn_u64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index d61c4b46596c0..7c0ae8e726ad4 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -2084,15 +2084,14 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
 ; GFX11-SDAG-NEXT:  .LBB14_6: ; %bb.1
 ; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v0, 2, 15
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX11-SDAG-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v1, s2
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
@@ -2100,16 +2099,16 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_7
 ; GFX11-SDAG-NEXT:  ; %bb.8:
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 1
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33 dlc
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s33 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s7
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s34, s8
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s7
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
@@ -2364,15 +2363,14 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_4
 ; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.1
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
 ; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX11-SDAG-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v1, s3
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
 ; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
@@ -2380,13 +2378,14 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB15_2
 ; GFX11-SDAG-NEXT:  ; %bb.3:
 ; GFX11-SDAG-NEXT:    s_add_i32 s2, s32, 0x7ff
-; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr31
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX11-SDAG-NEXT:    s_and_b32 s2, s2, 0xfffff800
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s1, 5, s2
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr31
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s2
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:  .LBB15_4: ; %Flow
 ; GFX11-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_8
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 15be44a335a1d..2a17e5733f597 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -2585,17 +2585,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    v_dual_add_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -2835,17 +2834,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_add_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -4641,17 +4639,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    v_dual_add_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -4891,17 +4888,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_add_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -7932,16 +7928,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -8222,10 +8219,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -8233,6 +8229,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -8479,10 +8476,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -8490,6 +8486,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -9655,10 +9652,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -9666,6 +9662,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -9912,10 +9909,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -9923,6 +9919,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -13988,16 +13985,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index a4410bb9ed2d0..2892641017296 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -5353,16 +5353,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
@@ -5556,15 +5557,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
 ; GFX1132-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
@@ -5722,15 +5723,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
 ; GFX1132-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -8901,16 +8902,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 68d7dcc60506c..40cca6bfc63cb 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -5353,16 +5353,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
@@ -5556,15 +5557,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
 ; GFX1132-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
@@ -5722,15 +5723,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
 ; GFX1132-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -8901,16 +8902,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 7126680525b87..e2d58b80e01f7 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -195,17 +195,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s4, s[0:1], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
 ; GFX1132-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -396,17 +395,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s4, s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
 ; GFX1132-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -1477,17 +1475,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -1727,17 +1724,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -2809,17 +2805,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -3059,17 +3054,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -4969,17 +4963,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -5219,17 +5212,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -8260,16 +8252,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -8550,10 +8543,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -8561,6 +8553,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -8807,10 +8800,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -8818,6 +8810,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -9982,10 +9975,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -9993,6 +9985,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -10239,10 +10232,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -10250,6 +10242,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -14314,16 +14307,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 5fab0c50bbe57..343780fde306f 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -565,22 +565,23 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ;
 ; GFX11-LABEL: srem32_invariant_denom:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_abs_i32 s2, s0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_abs_i32 s2, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX11-NEXT:    s_sub_i32 s3, 0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_i32 s3, s3, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_hi_u32 s5, s4, s3
 ; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:    s_add_i32 s4, s4, s5
@@ -601,7 +602,6 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX11-NEXT:    s_add_i32 s3, s3, 1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s5
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, 4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index f72f1e52d135f..0317df6f6a044 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -463,7 +463,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-LABEL: divergent_value:
 ; GFX1132DAGISEL:       ; %bb.0: ; %entry
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0
 ; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
@@ -476,7 +476,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
-; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132DAGISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 4551c60770bdf..5ec0224898bfd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -464,7 +464,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1132DAGISEL-LABEL: divergent_value:
 ; GFX1132DAGISEL:       ; %bb.0: ; %entry
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
@@ -477,7 +477,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
-; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132DAGISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
index eb2d95e4db2d5..e16ac07236e30 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -38,14 +38,15 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_const_idx(<4 x i32> %ad
 ; CHECK-LABEL: struct_atomic_buffer_load_i32_const_idx:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:  .LBB1_1: ; %bb1
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 idxen glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
 ; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
index bc50b12b59049..cc34020e0a28a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -38,14 +38,15 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_const_idx(ptr addrs
 ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:  .LBB1_1: ; %bb1
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 idxen glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
 ; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index dd77e575e7505..506753f8640e1 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -100,7 +100,7 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_3
 ; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
 ; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:  .LBB2_2: ; %for.body
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_wait_kmcnt 0x0
@@ -110,9 +110,9 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
-; GCN-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
-; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
+; GCN-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GCN-NEXT:    v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
+; GCN-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_2
 ; GCN-NEXT:  .LBB2_3: ; %for.end
diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index d62f045674ace..bf1ab9a07d5f6 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -54,7 +54,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ;
 ; GFX12-LABEL: _amdgpu_cs_main:
 ; GFX12:       ; %bb.0: ; %branch1_true
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_mov_b32 s4, 0
@@ -77,6 +76,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX12-NEXT:    s_cbranch_execz .LBB0_1
 ; GFX12-NEXT:  ; %bb.3: ; %branch2_merge
 ; GFX12-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_mov_b32 s5, s4
 ; GFX12-NEXT:    s_mov_b32 s6, s4
 ; GFX12-NEXT:    s_mov_b32 s7, s4
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
index 538ce15979de8..ffa0cf5b9b3e0 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -76,7 +76,7 @@
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x22
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
-; CHECK-NEXT:        .vgpr_count:     0x3
+; CHECK-NEXT:        .vgpr_count:     0x2
 ; CHECK-NEXT:      multiple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x24
 ; CHECK-NEXT:        .lds_size:       0

>From 9f4968127217ba277d7ced9272a84e3ad980ae7d Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Sun, 9 Feb 2025 18:45:49 +0100
Subject: [PATCH 3/8] Fix broken unit test

---
 llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 343780fde306f..5fab0c50bbe57 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -565,23 +565,22 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ;
 ; GFX11-LABEL: srem32_invariant_denom:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_abs_i32 s2, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_abs_i32 s2, s0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX11-NEXT:    s_sub_i32 s3, 0, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_i32 s3, s3, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_hi_u32 s5, s4, s3
 ; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:    s_add_i32 s4, s4, s5
@@ -602,6 +601,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX11-NEXT:    s_add_i32 s3, s3, 1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s5
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, 4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, 0

>From daf47238162a0f6e5465725cd24001b1854d46b9 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Mon, 10 Feb 2025 13:43:44 +0000
Subject: [PATCH 4/8] Reintroduce missing tests and fix existing one

---
 llvm/test/CodeGen/AMDGPU/idiv-licm.ll         |  123 +-
 .../machine-scheduler-sink-trivial-remats.mir | 1260 +++++++++++++++++
 2 files changed, 1322 insertions(+), 61 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 5fab0c50bbe57..c71d080b066b0 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -565,22 +565,23 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ;
 ; GFX11-LABEL: srem32_invariant_denom:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_abs_i32 s2, s0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_abs_i32 s2, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX11-NEXT:    s_sub_i32 s3, 0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_i32 s3, s3, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_hi_u32 s5, s4, s3
 ; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:    s_add_i32 s4, s4, s5
@@ -601,7 +602,6 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX11-NEXT:    s_add_i32 s3, s3, 1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s5
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, 4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, 0
@@ -694,31 +694,32 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX11-NEXT:    s_mov_b32 s2, 0
-; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB4_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    s_and_b32 s3, 0xffff, s2
 ; GFX11-NEXT:    s_add_i32 s2, s2, 1
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, s3
 ; GFX11-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v4, s3
+; GFX11-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX11-NEXT:    s_and_b32 s3, s2, 0xffff
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_cmpk_eq_i32 s3, 0x400
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX11-NEXT:    v_fma_f32 v2, -v1, v0, v2
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
-; GFX11-NEXT:    global_store_b16 v4, v2, s[0:1]
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    global_store_b16 v3, v1, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB4_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
@@ -812,33 +813,34 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB5_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    s_and_b32 s4, 0xffff, s3
 ; GFX11-NEXT:    s_add_i32 s3, s3, 1
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, s4
 ; GFX11-NEXT:    s_lshl_b32 s5, s4, 1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX11-NEXT:    v_fma_f32 v2, -v1, v0, v2
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
-; GFX11-NEXT:    v_mov_b32_e32 v3, s5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_lo_u32 v2, v2, s2
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v2, s5
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_lo_u32 v1, v1, s2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
 ; GFX11-NEXT:    s_and_b32 s4, s3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmpk_eq_i32 s4, 0x400
-; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
+; GFX11-NEXT:    global_store_b16 v2, v1, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB5_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
@@ -940,38 +942,37 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_sext_i32_i16 s2, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB6_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    s_sext_i32_i16 s4, s3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX11-NEXT:    s_xor_b32 s4, s4, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_or_b32 s4, s4, 1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX11-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX11-NEXT:    v_fma_f32 v2, -v1, v0, v2
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s5, |v2|, |v0|
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_and_b32 s5, s5, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s4, s4, 0
 ; GFX11-NEXT:    s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, s4, v2
-; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
 ; GFX11-NEXT:    s_add_i32 s3, s3, 1
-; GFX11-NEXT:    v_mov_b32_e32 v3, s5
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_add_nc_u32 v1, s4, v1
 ; GFX11-NEXT:    s_and_b32 s4, s3, 0xffff
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmpk_eq_i32 s4, 0x400
-; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
+; GFX11-NEXT:    global_store_b16 v2, v1, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB6_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
@@ -1077,42 +1078,42 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_sext_i32_i16 s2, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB7_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    s_sext_i32_i16 s4, s3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX11-NEXT:    s_xor_b32 s5, s4, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_ashr_i32 s5, s5, 30
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_or_b32 s5, s5, 1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX11-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX11-NEXT:    v_fma_f32 v2, -v1, v0, v2
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s6, |v2|, |v0|
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s5, s5, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, s5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, s5, v1
 ; GFX11-NEXT:    s_and_b32 s5, 0xffff, s3
 ; GFX11-NEXT:    s_add_i32 s3, s3, 1
 ; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_lo_u32 v2, v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s5
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s5
+; GFX11-NEXT:    v_mul_lo_u32 v1, v1, s2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
 ; GFX11-NEXT:    s_and_b32 s4, s3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmpk_eq_i32 s4, 0x400
-; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
+; GFX11-NEXT:    global_store_b16 v2, v1, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB7_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index e5a68dd2e26ef..c3c3cb0bf94ec 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -7446,6 +7446,7 @@ body:             |
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_8]], implicit [[V_CVT_I32_F32_e32_16]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]], implicit [[V_CVT_I32_F32_e32_22]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_24]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_25]], implicit [[V_CVT_I32_F32_e32_26]]
@@ -7544,6 +7545,7 @@ body:             |
     S_NOP 0, implicit %59, implicit %69
     S_NOP 0, implicit %70, implicit %71
     S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %72, implicit %73
     S_NOP 0, implicit %74, implicit %75
     S_NOP 0, implicit %76, implicit %77
     S_NOP 0, implicit %78, implicit %79
@@ -7551,4 +7553,1262 @@ body:             |
     $exec = S_MOV_B64 %101:sreg_64_xexec
     S_ENDPGM 0
 ...
+---
+name:            remat_virtual_vgpr_occ_6
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: remat_virtual_vgpr_occ_6
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF6]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
+  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; GFX908-NEXT:   S_BRANCH %bb.1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_26]], implicit [[V_CVT_I32_F32_e32_27]], implicit [[DEF22]], implicit [[DEF27]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_28]], implicit [[V_CVT_I32_F32_e32_22]], implicit [[DEF23]], implicit [[DEF28]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_29]], implicit [[V_CVT_I32_F32_e32_23]], implicit [[DEF24]], implicit [[DEF29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_30]], implicit [[V_CVT_I32_F32_e32_24]], implicit [[DEF25]], implicit [[DEF30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_31]], implicit [[V_CVT_I32_F32_e32_25]], implicit [[DEF26]], implicit [[DEF31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_6]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_11]], implicit [[V_CVT_I32_F32_e32_12]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_15]], implicit [[V_CVT_I32_F32_e32_16]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %2:vgpr_32(s32) = COPY $vgpr0
+    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
+    undef %5.sub1:sreg_64 = S_MOV_B32 0
+    %5.sub0:sreg_64 = COPY %3.sub1
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vgpr_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    %16:vgpr_32 = IMPLICIT_DEF
+    %17:vgpr_32 = IMPLICIT_DEF
+    %18:vgpr_32 = IMPLICIT_DEF
+    %19:vgpr_32 = IMPLICIT_DEF
+    %20:vgpr_32 = IMPLICIT_DEF
+    %21:vgpr_32 = IMPLICIT_DEF
+    %22:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %26:vgpr_32 = IMPLICIT_DEF
+    %27:vgpr_32 = IMPLICIT_DEF
+    %28:vgpr_32 = IMPLICIT_DEF
+    %29:vgpr_32 = IMPLICIT_DEF
+    %30:vgpr_32 = IMPLICIT_DEF
+    %31:vgpr_32 = IMPLICIT_DEF
+    %32:vgpr_32 = IMPLICIT_DEF
+    %33:vgpr_32 = IMPLICIT_DEF
+    %34:vgpr_32 = IMPLICIT_DEF
+    %35:vgpr_32 = IMPLICIT_DEF
+    %36:vgpr_32 = IMPLICIT_DEF
+    %37:vgpr_32 = IMPLICIT_DEF
+    %38:vgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %40:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10, implicit $exec, implicit $mode
+    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
+    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
+    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
+    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
+    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
+    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
+    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
+    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
+    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
+    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %20, implicit $exec, implicit $mode
+    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
+    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
+    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
+    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
+    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
+    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
+    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
+    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
+    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
+    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
+    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
+    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
+    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
+    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
+    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
+    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
+    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
+    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
+    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
+    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
+    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_NOP 0, implicit %50, implicit %60, implicit %10, implicit %20
+    S_NOP 0, implicit %51, implicit %61, implicit %11, implicit %21
+    S_NOP 0, implicit %52, implicit %62, implicit %12, implicit %22
+    S_NOP 0, implicit %53, implicit %63, implicit %13, implicit %23
+    S_NOP 0, implicit %54, implicit %64, implicit %14, implicit %24
+    S_NOP 0, implicit %55, implicit %65
+    S_NOP 0, implicit %56, implicit %66
+    S_NOP 0, implicit %57, implicit %67
+    S_NOP 0, implicit %58, implicit %68
+    S_NOP 0, implicit %59, implicit %69
+    S_NOP 0, implicit %70, implicit %71
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %74, implicit %75
+    S_NOP 0, implicit %76, implicit %77
+    S_NOP 0, implicit %78, implicit %79
+    S_NOP 0, implicit %80
+    S_ENDPGM 0
+...
+
+---
+name:            remat_virtual_vgpr_uses_not_available
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: remat_virtual_vgpr_uses_not_available
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF6]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
+  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; GFX908-NEXT:   S_BRANCH %bb.1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_30]], implicit [[V_CVT_I32_F32_e32_31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_6]], implicit [[DEF22]], implicit [[DEF27]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_22]], implicit [[V_CVT_I32_F32_e32_26]], implicit [[DEF23]], implicit [[DEF28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_27]], implicit [[DEF24]], implicit [[DEF29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_24]], implicit [[V_CVT_I32_F32_e32_28]], implicit [[DEF25]], implicit [[DEF30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_25]], implicit [[V_CVT_I32_F32_e32_29]], implicit [[DEF26]], implicit [[DEF31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_11]], implicit [[V_CVT_I32_F32_e32_12]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_15]], implicit [[V_CVT_I32_F32_e32_16]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
 
+    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %2:vgpr_32(s32) = COPY $vgpr0
+    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
+    undef %5.sub1:sreg_64 = S_MOV_B32 0
+    %5.sub0:sreg_64 = COPY %3.sub1
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vgpr_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    %16:vgpr_32 = IMPLICIT_DEF
+    %17:vgpr_32 = IMPLICIT_DEF
+    %18:vgpr_32 = IMPLICIT_DEF
+    %19:vgpr_32 = IMPLICIT_DEF
+    %20:vgpr_32 = IMPLICIT_DEF
+    %21:vgpr_32 = IMPLICIT_DEF
+    %22:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %26:vgpr_32 = IMPLICIT_DEF
+    %27:vgpr_32 = IMPLICIT_DEF
+    %28:vgpr_32 = IMPLICIT_DEF
+    %29:vgpr_32 = IMPLICIT_DEF
+    %30:vgpr_32 = IMPLICIT_DEF
+    %31:vgpr_32 = IMPLICIT_DEF
+    %32:vgpr_32 = IMPLICIT_DEF
+    %33:vgpr_32 = IMPLICIT_DEF
+    %34:vgpr_32 = IMPLICIT_DEF
+    %35:vgpr_32 = IMPLICIT_DEF
+    %36:vgpr_32 = IMPLICIT_DEF
+    %37:vgpr_32 = IMPLICIT_DEF
+    %38:vgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %40:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10, implicit $exec, implicit $mode
+    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
+    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
+    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
+    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
+    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
+    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
+    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
+    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
+    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
+    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %20, implicit $exec, implicit $mode
+    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
+    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
+    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
+    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
+    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
+    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
+    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
+    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
+    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
+    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
+    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
+    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
+    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
+    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
+    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
+    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
+    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
+    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
+    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
+    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
+    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_NOP 0, implicit %55, implicit %65
+    S_NOP 0, implicit %50, implicit %60, implicit %11, implicit %21
+    S_NOP 0, implicit %51, implicit %61, implicit %12, implicit %22
+    S_NOP 0, implicit %52, implicit %62, implicit %13, implicit %23
+    S_NOP 0, implicit %53, implicit %63, implicit %14, implicit %24
+    S_NOP 0, implicit %54, implicit %64, implicit %15, implicit %25
+    S_NOP 0, implicit %56, implicit %66
+    S_NOP 0, implicit %57, implicit %67
+    S_NOP 0, implicit %58, implicit %68
+    S_NOP 0, implicit %59, implicit %69
+    S_NOP 0, implicit %70, implicit %71
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %74, implicit %75
+    S_NOP 0, implicit %76, implicit %77
+    S_NOP 0, implicit %78, implicit %79
+    S_NOP 0, implicit %80
+    S_ENDPGM 0
+...
+
+---
+name:            remat_virtual_vgpr_subreg_occ_6
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: remat_virtual_vgpr_subreg_occ_6
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   undef [[DEF:%[0-9]+]].sub0:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub1:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub2:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub3:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
+  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub2, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   S_BRANCH %bb.1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub0, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_27]], implicit [[V_CVT_I32_F32_e32_22]], implicit [[DEF]].sub0, implicit [[DEF]].sub2
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_28]], implicit [[V_CVT_I32_F32_e32_23]], implicit [[DEF28]], implicit [[DEF24]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_29]], implicit [[V_CVT_I32_F32_e32_24]], implicit [[DEF29]], implicit [[DEF25]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_30]], implicit [[V_CVT_I32_F32_e32_25]], implicit [[DEF30]], implicit [[DEF26]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_31]], implicit [[V_CVT_I32_F32_e32_26]], implicit [[DEF31]], implicit [[DEF27]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_6]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_11]], implicit [[V_CVT_I32_F32_e32_12]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_15]], implicit [[V_CVT_I32_F32_e32_16]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %2:vgpr_32(s32) = COPY $vgpr0
+    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
+    undef %5.sub1:sreg_64 = S_MOV_B32 0
+    %5.sub0:sreg_64 = COPY %3.sub1
+    undef %10.sub0:vreg_512 = IMPLICIT_DEF
+    %10.sub1:vreg_512 = IMPLICIT_DEF
+    %10.sub2:vreg_512 = IMPLICIT_DEF
+    %10.sub3:vreg_512 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vgpr_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    %16:vgpr_32 = IMPLICIT_DEF
+    %17:vgpr_32 = IMPLICIT_DEF
+    %18:vgpr_32 = IMPLICIT_DEF
+    %19:vgpr_32 = IMPLICIT_DEF
+    %20:vgpr_32 = IMPLICIT_DEF
+    %21:vgpr_32 = IMPLICIT_DEF
+    %22:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %26:vgpr_32 = IMPLICIT_DEF
+    %27:vgpr_32 = IMPLICIT_DEF
+    %28:vgpr_32 = IMPLICIT_DEF
+    %29:vgpr_32 = IMPLICIT_DEF
+    %30:vgpr_32 = IMPLICIT_DEF
+    %31:vgpr_32 = IMPLICIT_DEF
+    %32:vgpr_32 = IMPLICIT_DEF
+    %33:vgpr_32 = IMPLICIT_DEF
+    %34:vgpr_32 = IMPLICIT_DEF
+    %35:vgpr_32 = IMPLICIT_DEF
+    %36:vgpr_32 = IMPLICIT_DEF
+    %37:vgpr_32 = IMPLICIT_DEF
+    %38:vgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %40:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10.sub0, implicit $exec, implicit $mode
+    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
+    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
+    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
+    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
+    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
+    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
+    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
+    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
+    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
+    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10.sub2, implicit $exec, implicit $mode
+    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
+    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
+    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
+    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
+    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
+    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
+    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
+    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
+    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
+    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
+    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
+    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
+    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
+    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
+    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
+    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
+    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
+    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
+    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
+    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
+    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_NOP 0, implicit %50, implicit %60, implicit %10.sub0, implicit %10.sub2
+    S_NOP 0, implicit %51, implicit %61, implicit %11, implicit %21
+    S_NOP 0, implicit %52, implicit %62, implicit %12, implicit %22
+    S_NOP 0, implicit %53, implicit %63, implicit %13, implicit %23
+    S_NOP 0, implicit %54, implicit %64, implicit %14, implicit %24
+    S_NOP 0, implicit %55, implicit %65
+    S_NOP 0, implicit %56, implicit %66
+    S_NOP 0, implicit %57, implicit %67
+    S_NOP 0, implicit %58, implicit %68
+    S_NOP 0, implicit %59, implicit %69
+    S_NOP 0, implicit %70, implicit %71
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %74, implicit %75
+    S_NOP 0, implicit %76, implicit %77
+    S_NOP 0, implicit %78, implicit %79
+    S_NOP 0, implicit %80
+    S_ENDPGM 0
+...
+
+---
+name:            remat_virtual_vgpr_subreg_uses_not_available
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: remat_virtual_vgpr_subreg_uses_not_available
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   undef [[DEF:%[0-9]+]].sub0:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub1:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub2:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub3:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
+  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub0, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub2, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_BRANCH %bb.1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_22]], implicit [[V_CVT_I32_F32_e32_27]], implicit [[DEF]].sub1, implicit [[DEF]].sub3
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_6]], implicit [[DEF24]], implicit [[DEF28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_28]], implicit [[DEF25]], implicit [[DEF29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_24]], implicit [[V_CVT_I32_F32_e32_29]], implicit [[DEF26]], implicit [[DEF30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_25]], implicit [[V_CVT_I32_F32_e32_30]], implicit [[DEF27]], implicit [[DEF31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_26]], implicit [[V_CVT_I32_F32_e32_31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_11]], implicit [[V_CVT_I32_F32_e32_12]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_15]], implicit [[V_CVT_I32_F32_e32_16]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %2:vgpr_32(s32) = COPY $vgpr0
+    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
+    undef %5.sub1:sreg_64 = S_MOV_B32 0
+    %5.sub0:sreg_64 = COPY %3.sub1
+    undef %10.sub0:vreg_512 = IMPLICIT_DEF
+    %10.sub1:vreg_512 = IMPLICIT_DEF
+    %10.sub2:vreg_512 = IMPLICIT_DEF
+    %10.sub3:vreg_512 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vgpr_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    %16:vgpr_32 = IMPLICIT_DEF
+    %17:vgpr_32 = IMPLICIT_DEF
+    %18:vgpr_32 = IMPLICIT_DEF
+    %19:vgpr_32 = IMPLICIT_DEF
+    %20:vgpr_32 = IMPLICIT_DEF
+    %21:vgpr_32 = IMPLICIT_DEF
+    %22:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %26:vgpr_32 = IMPLICIT_DEF
+    %27:vgpr_32 = IMPLICIT_DEF
+    %28:vgpr_32 = IMPLICIT_DEF
+    %29:vgpr_32 = IMPLICIT_DEF
+    %30:vgpr_32 = IMPLICIT_DEF
+    %31:vgpr_32 = IMPLICIT_DEF
+    %32:vgpr_32 = IMPLICIT_DEF
+    %33:vgpr_32 = IMPLICIT_DEF
+    %34:vgpr_32 = IMPLICIT_DEF
+    %35:vgpr_32 = IMPLICIT_DEF
+    %36:vgpr_32 = IMPLICIT_DEF
+    %37:vgpr_32 = IMPLICIT_DEF
+    %38:vgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %40:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10.sub0, implicit $exec, implicit $mode
+    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
+    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
+    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
+    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
+    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
+    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
+    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
+    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
+    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
+    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10.sub2, implicit $exec, implicit $mode
+    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
+    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
+    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
+    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
+    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
+    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
+    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
+    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
+    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
+    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
+    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
+    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
+    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
+    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
+    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
+    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
+    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
+    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
+    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
+    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
+    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_NOP 0, implicit %50, implicit %60, implicit %10.sub1, implicit %10.sub3
+    S_NOP 0, implicit %51, implicit %61, implicit %12, implicit %22
+    S_NOP 0, implicit %52, implicit %62, implicit %13, implicit %23
+    S_NOP 0, implicit %53, implicit %63, implicit %14, implicit %24
+    S_NOP 0, implicit %54, implicit %64, implicit %15, implicit %25
+    S_NOP 0, implicit %55, implicit %65
+    S_NOP 0, implicit %56, implicit %66
+    S_NOP 0, implicit %57, implicit %67
+    S_NOP 0, implicit %58, implicit %68
+    S_NOP 0, implicit %59, implicit %69
+    S_NOP 0, implicit %70, implicit %71
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %74, implicit %75
+    S_NOP 0, implicit %76, implicit %77
+    S_NOP 0, implicit %78, implicit %79
+    S_NOP 0, implicit %80
+    S_ENDPGM 0
+...
+
+
+---
+name:            remat_virtual_vgpr_subreg_def
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: remat_virtual_vgpr_subreg_def
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX908-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   undef [[DEF:%[0-9]+]].sub0:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub1:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub2:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]].sub3:vreg_512 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
+  ; GFX908-NEXT:   dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; GFX908-NEXT:   undef [[V_CVT_I32_F32_e32_1:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF22]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   undef [[V_CVT_I32_F32_e32_1:%[0-9]+]].sub0:vreg_64 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub0, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_BRANCH %bb.1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]].sub2, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]].sub0, implicit [[V_CVT_I32_F32_e32_27]], implicit [[DEF]].sub0, implicit [[DEF]].sub2
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_28]], implicit [[DEF1]], implicit [[DEF28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_23]], implicit [[V_CVT_I32_F32_e32_24]], implicit [[DEF25]], implicit [[DEF29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_29]], implicit [[V_CVT_I32_F32_e32_25]], implicit [[DEF26]], implicit [[DEF30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_30]], implicit [[V_CVT_I32_F32_e32_26]], implicit [[DEF27]], implicit [[DEF31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_6]], implicit [[V_CVT_I32_F32_e32_11]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_12]], implicit [[V_CVT_I32_F32_e32_13]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_14]], implicit [[V_CVT_I32_F32_e32_15]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_14]], implicit [[V_CVT_I32_F32_e32_15]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_16]], implicit [[V_CVT_I32_F32_e32_17]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_18]], implicit [[V_CVT_I32_F32_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_20]], implicit [[V_CVT_I32_F32_e32_21]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_22]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %2:vgpr_32(s32) = COPY $vgpr0
+    %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+    %4:sreg_64 = V_CMP_GT_U32_e64 %3.sub0, %2(s32), implicit $exec
+    undef %5.sub1:sreg_64 = S_MOV_B32 0
+    %5.sub0:sreg_64 = COPY %3.sub1
+    undef %10.sub0:vreg_512 = IMPLICIT_DEF
+    %10.sub1:vreg_512 = IMPLICIT_DEF
+    %10.sub2:vreg_512 = IMPLICIT_DEF
+    %10.sub3:vreg_512 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vgpr_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    %16:vgpr_32 = IMPLICIT_DEF
+    %17:vgpr_32 = IMPLICIT_DEF
+    %18:vgpr_32 = IMPLICIT_DEF
+    %19:vgpr_32 = IMPLICIT_DEF
+    %20:vgpr_32 = IMPLICIT_DEF
+    %21:vgpr_32 = IMPLICIT_DEF
+    %22:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %26:vgpr_32 = IMPLICIT_DEF
+    %27:vgpr_32 = IMPLICIT_DEF
+    %28:vgpr_32 = IMPLICIT_DEF
+    %29:vgpr_32 = IMPLICIT_DEF
+    %30:vgpr_32 = IMPLICIT_DEF
+    %31:vgpr_32 = IMPLICIT_DEF
+    %32:vgpr_32 = IMPLICIT_DEF
+    %33:vgpr_32 = IMPLICIT_DEF
+    %34:vgpr_32 = IMPLICIT_DEF
+    %35:vgpr_32 = IMPLICIT_DEF
+    %36:vgpr_32 = IMPLICIT_DEF
+    %37:vgpr_32 = IMPLICIT_DEF
+    %38:vgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %40:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    undef %50.sub0:vreg_64 = nofpexcept V_CVT_I32_F32_e32 %10.sub0, implicit $exec, implicit $mode
+    %50.sub1:vreg_64 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
+    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
+    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
+    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
+    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
+    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
+    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
+    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
+    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
+    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10.sub2, implicit $exec, implicit $mode
+    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
+    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
+    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
+    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
+    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
+    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
+    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
+    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
+    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
+    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
+    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
+    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
+    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
+    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
+    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
+    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
+    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
+    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
+    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
+    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
+    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_NOP 0, implicit %50.sub0, implicit %60, implicit %10.sub0, implicit %10.sub2
+    S_NOP 0, implicit %52, implicit %61, implicit %11, implicit %21
+    S_NOP 0, implicit %52, implicit %62, implicit %12, implicit %22
+    S_NOP 0, implicit %53, implicit %63, implicit %13, implicit %23
+    S_NOP 0, implicit %54, implicit %64, implicit %14, implicit %24
+    S_NOP 0, implicit %55, implicit %65
+    S_NOP 0, implicit %56, implicit %66
+    S_NOP 0, implicit %57, implicit %67
+    S_NOP 0, implicit %58, implicit %68
+    S_NOP 0, implicit %59, implicit %69
+    S_NOP 0, implicit %70, implicit %71
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %74, implicit %75
+    S_NOP 0, implicit %76, implicit %77
+    S_NOP 0, implicit %78, implicit %79
+    S_NOP 0, implicit %80
+    S_ENDPGM 0
+...
+
+# The remat candidate (instruction defining %3) uses a register (%2) which does not have a subrange for every subregister (sub1 is undefined).
+# Be sure that when checking if we can rematerialize %3, that we handle the liveness checking of %2 properly.
+
+---
+name:            omitted_subrange
+tracksRegLiveness: true
+body:             |
+  ; GFX908-LABEL: name: omitted_subrange
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT:   liveins: $sgpr3, $sgpr4
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF19]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF6]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF7]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF8]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF9]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF10]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF11]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF12]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF13]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF14]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF15]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF16]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF17]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF20]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF21]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; GFX908-NEXT:   undef [[V_RCP_F32_e32_:%[0-9]+]].sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_RCP_F32_e32 [[DEF22]].sub0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF32:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   dead [[DEF33:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF34:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; GFX908-NEXT:   S_BRANCH %bb.1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX908-NEXT:   liveins: $sgpr3, $sgpr4
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   %temp:vgpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   S_CMP_LG_U32 $sgpr3, $sgpr4, implicit-def $scc
+  ; GFX908-NEXT:   [[DEF34:%[0-9]+]].sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, [[DEF22]].sub0, 1, %temp, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; GFX908-NEXT:   S_BRANCH %bb.3
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.2:
+  ; GFX908-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-NEXT:   liveins: $sgpr3, $sgpr4
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   undef [[V_FMA_F32_e64_:%[0-9]+]].sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_FMA_F32_e64 0, [[DEF34]].sub1, 0, [[V_RCP_F32_e32_]].sub0, 0, [[DEF34]].sub0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   %temp2:vreg_64_align2 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_PK_MUL_F32_:%[0-9]+]]:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_PK_MUL_F32 0, [[V_RCP_F32_e32_]], 8, [[DEF22]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   dead [[V_PK_FMA_F32_:%[0-9]+]]:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_PK_FMA_F32 0, [[V_FMA_F32_e64_]], 8, %temp2, 11, [[V_PK_MUL_F32_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF23]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF28]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF24]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF29]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_22]], implicit [[V_CVT_I32_F32_e32_23]], implicit [[DEF23]], implicit [[DEF28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_24]], implicit [[V_CVT_I32_F32_e32_25]], implicit [[DEF24]], implicit [[DEF29]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF25]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF30]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF26]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF27]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF32]], implicit $exec, implicit $mode
+  ; GFX908-NEXT:   dead [[DEF35:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_26]], implicit [[V_CVT_I32_F32_e32_27]], implicit [[DEF25]], implicit [[DEF30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_28]], implicit [[V_CVT_I32_F32_e32_29]], implicit [[DEF26]], implicit [[DEF31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_30]], implicit [[V_CVT_I32_F32_e32_31]], implicit [[DEF27]], implicit [[DEF32]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_1]], implicit [[V_CVT_I32_F32_e32_6]], implicit [[DEF22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_2]], implicit [[V_CVT_I32_F32_e32_7]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_3]], implicit [[V_CVT_I32_F32_e32_8]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_4]], implicit [[V_CVT_I32_F32_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_5]], implicit [[V_CVT_I32_F32_e32_10]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_11]], implicit [[V_CVT_I32_F32_e32_12]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_13]], implicit [[V_CVT_I32_F32_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_15]], implicit [[V_CVT_I32_F32_e32_16]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_17]], implicit [[V_CVT_I32_F32_e32_18]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_19]], implicit [[V_CVT_I32_F32_e32_20]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F32_e32_21]]
+  ; GFX908-NEXT:   S_BRANCH %bb.3
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.3:
+  ; GFX908-NEXT:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
+  ; GFX908-NEXT:   liveins: $sgpr3, $sgpr4
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   S_CMP_LG_U32 $sgpr4, 0, implicit-def $scc
+  ; GFX908-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+  ; GFX908-NEXT:   S_BRANCH %bb.4
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.4:
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $sgpr3, $sgpr4
+
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    undef %2.sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_RCP_F32_e32 %1.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %3:vreg_64_align2 =  nnan ninf nsz arcp contract afn reassoc nofpexcept V_PK_MUL_F32 0, %2, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %5:vreg_64_align2 = IMPLICIT_DEF
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vgpr_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    %16:vgpr_32 = IMPLICIT_DEF
+    %17:vgpr_32 = IMPLICIT_DEF
+    %18:vgpr_32 = IMPLICIT_DEF
+    %19:vgpr_32 = IMPLICIT_DEF
+    %20:vgpr_32 = IMPLICIT_DEF
+    %21:vgpr_32 = IMPLICIT_DEF
+    %22:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %26:vgpr_32 = IMPLICIT_DEF
+    %27:vgpr_32 = IMPLICIT_DEF
+    %28:vgpr_32 = IMPLICIT_DEF
+    %29:vgpr_32 = IMPLICIT_DEF
+    %30:vgpr_32 = IMPLICIT_DEF
+    %31:vgpr_32 = IMPLICIT_DEF
+    %32:vgpr_32 = IMPLICIT_DEF
+    %33:vgpr_32 = IMPLICIT_DEF
+    %34:vgpr_32 = IMPLICIT_DEF
+    %35:vgpr_32 = IMPLICIT_DEF
+    %36:vgpr_32 = IMPLICIT_DEF
+    %37:vgpr_32 = IMPLICIT_DEF
+    %38:vgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %40:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    %50:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %10, implicit $exec, implicit $mode
+    %51:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %11, implicit $exec, implicit $mode
+    %52:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %12, implicit $exec, implicit $mode
+    %53:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %13, implicit $exec, implicit $mode
+    %54:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %14, implicit $exec, implicit $mode
+    %55:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %15, implicit $exec, implicit $mode
+    %56:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %16, implicit $exec, implicit $mode
+    %57:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %17, implicit $exec, implicit $mode
+    %58:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %18, implicit $exec, implicit $mode
+    %59:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %19, implicit $exec, implicit $mode
+    %60:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %20, implicit $exec, implicit $mode
+    %61:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %21, implicit $exec, implicit $mode
+    %62:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %22, implicit $exec, implicit $mode
+    %63:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %23, implicit $exec, implicit $mode
+    %64:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %24, implicit $exec, implicit $mode
+    %65:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %25, implicit $exec, implicit $mode
+    %66:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %26, implicit $exec, implicit $mode
+    %67:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %27, implicit $exec, implicit $mode
+    %68:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %28, implicit $exec, implicit $mode
+    %69:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %29, implicit $exec, implicit $mode
+    %70:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %30, implicit $exec, implicit $mode
+    %71:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %31, implicit $exec, implicit $mode
+    %72:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %32, implicit $exec, implicit $mode
+    %73:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %33, implicit $exec, implicit $mode
+    %74:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %34, implicit $exec, implicit $mode
+    %75:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %35, implicit $exec, implicit $mode
+    %76:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %36, implicit $exec, implicit $mode
+    %77:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %37, implicit $exec, implicit $mode
+    %78:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %38, implicit $exec, implicit $mode
+    %79:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %39, implicit $exec, implicit $mode
+    %80:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %40, implicit $exec, implicit $mode
+    %81:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %41, implicit $exec, implicit $mode
+    S_BRANCH %bb.1
+
+  bb.1:
+    liveins: $sgpr3, $sgpr4
+
+    %temp:vgpr_32 = IMPLICIT_DEF
+    %5.sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, %1.sub0:vreg_64_align2, 1, %temp, 0, 0, implicit $mode, implicit $exec
+
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+    S_BRANCH %bb.4
+
+  bb.3:
+    liveins: $sgpr3, $sgpr4
+
+    %6:vreg_64_align2 = IMPLICIT_DEF
+    undef %7.sub0:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_FMA_F32_e64 0, %5.sub1, 0, %2.sub0:vreg_64_align2, 0, %5.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %temp2:vreg_64_align2 = IMPLICIT_DEF
+    %8:vreg_64_align2 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_PK_FMA_F32 0, %7:vreg_64_align2, 8, %temp2, 11, %3:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_NOP 0, implicit %50, implicit %60, implicit %10, implicit %20
+    S_NOP 0, implicit %51, implicit %61, implicit %11, implicit %21
+    S_NOP 0, implicit %52, implicit %62, implicit %12, implicit %22
+    S_NOP 0, implicit %53, implicit %63, implicit %13, implicit %23
+    S_NOP 0, implicit %54, implicit %64, implicit %14, implicit %24
+    S_NOP 0, implicit %55, implicit %65, implicit %1
+    S_NOP 0, implicit %56, implicit %66
+    S_NOP 0, implicit %57, implicit %67
+    S_NOP 0, implicit %58, implicit %68
+    S_NOP 0, implicit %59, implicit %69
+    S_NOP 0, implicit %70, implicit %71
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %72, implicit %73
+    S_NOP 0, implicit %74, implicit %75
+    S_NOP 0, implicit %76, implicit %77
+    S_NOP 0, implicit %78, implicit %79
+    S_NOP 0, implicit %80
+    S_BRANCH %bb.4
+
+  bb.4:
+    liveins: $sgpr3, $sgpr4
+
+    S_CMP_LG_U32 $sgpr4, 0, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+...

>From 6e0e80e6f6570ae227fceffb4f983c346bf24e80 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Mon, 17 Feb 2025 12:24:58 +0000
Subject: [PATCH 5/8] Address feedback

- Properly account for AGPRs in excess RP calculations.
- Account for SGPR spilling when deciding whether to try to increase
  occupancy.
- Don't rollback when rescheduling managed to improve occuoancy after
  remats.
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 157 +++--
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |   7 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       |  20 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  28 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  40 +-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  15 -
 .../atomic_optimizations_global_pointer.ll    |   4 +-
 .../atomic_optimizations_local_pointer.ll     | 141 +++--
 .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll |  41 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        |  50 +-
 .../AMDGPU/global_atomics_scan_fmax.ll        |  22 +-
 .../AMDGPU/global_atomics_scan_fmin.ll        |  22 +-
 .../AMDGPU/global_atomics_scan_fsub.ll        |  70 ++-
 llvm/test/CodeGen/AMDGPU/idiv-licm.ll         | 123 ++--
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll |   4 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll |   4 +-
 .../llvm.amdgcn.struct.atomic.buffer.load.ll  |   7 +-
 ...vm.amdgcn.struct.ptr.atomic.buffer.load.ll |   7 +-
 .../test/CodeGen/AMDGPU/loop-prefetch-data.ll |   8 +-
 ...ine-scheduler-sink-trivial-remats-attr.mir | 555 +++++++++++++++++-
 .../CodeGen/AMDGPU/no-dup-inst-prefetch.ll    |   2 +-
 .../AMDGPU/pal-metadata-3.0-callable.ll       |   2 +-
 22 files changed, 944 insertions(+), 385 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index fbf1f9dae03a5..8f5b96fb788d8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1675,6 +1675,41 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
   return true;
 }
 
+bool PreRARematStage::hasExcessVGPRs(const GCNRegPressure &RP,
+                                     unsigned MaxVGPRs,
+                                     unsigned &ExcessArchVGPRs,
+                                     bool &AGPRLimited) {
+  unsigned NumAGPRs = RP.getAGPRNum();
+  if (!ST.hasGFX90AInsts() || !NumAGPRs) {
+    // Non-unified RF. We can only reduce ArchVGPR excess pressure at this
+    // point, but still want to identify when there is AGPR excess pressure.
+    bool HasSpill = false;
+    unsigned NumArchVGPRs = RP.getArchVGPRNum();
+    if (NumArchVGPRs > MaxVGPRs) {
+      ExcessArchVGPRs = NumArchVGPRs - MaxVGPRs;
+      HasSpill = true;
+    }
+    if (NumAGPRs > MaxVGPRs) {
+      AGPRLimited = true;
+      HasSpill = true;
+    }
+    return HasSpill;
+  }
+  if (RP.getVGPRNum(true) > MaxVGPRs) {
+    // Unified RF. We can only remat ArchVGPRs; AGPR pressure alone may prevent
+    // us from eliminating spilling.
+    unsigned NumArchVGPRs = RP.getArchVGPRNum();
+    if (NumAGPRs >= MaxVGPRs) {
+      AGPRLimited = true;
+      ExcessArchVGPRs = NumArchVGPRs;
+    } else {
+      ExcessArchVGPRs = NumArchVGPRs - alignDown(MaxVGPRs - NumAGPRs, 4);
+    }
+    return true;
+  }
+  return false;
+}
+
 bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
 
@@ -1684,51 +1719,86 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
   // occupancy, or regions with VGPR spilling) to their excess RP.
   DenseMap<unsigned, unsigned> OptRegions;
-
-  // Note that the maximum number of VGPRs to use to eliminate spill may be
-  // lower than the maximum number to increase occupancy when the function has
-  // the "amdgpu-num-vgpr" attribute.
-  const std::pair<unsigned, unsigned> OccBounds =
+  const Function &F = MF.getFunction();
+  const bool UnifiedRF = ST.hasGFX90AInsts();
+
+  // Adjust workgroup size induced occupancy bounds with the
+  // "amdgpu-waves-per-eu" attribute. This should be offloaded to a subtarget
+  // method, but at this point is if unclear how other parts of the codebase
+  // interpret this attribute and the default behavior produces unexpected
+  // bounds. Here we want to allow users to ask for target occupancies lower
+  // than the default lower bound.
+  std::pair<unsigned, unsigned> OccBounds =
       ST.getOccupancyWithWorkGroupSizes(MF);
-  // FIXME: we should be able to just call ST.getMaxNumArchVGPRs() but that
-  // would use the occupancy bounds as determined by
-  // MF.getFunction().getWavesPerEU(), which look incorrect in some cases.
+  std::pair<unsigned, unsigned> WavesPerEU =
+      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
+  if (WavesPerEU.first <= WavesPerEU.second) {
+    if (WavesPerEU.first && WavesPerEU.first <= OccBounds.second)
+      OccBounds.first = WavesPerEU.first;
+    if (WavesPerEU.second)
+      OccBounds.second = std::min(OccBounds.second, WavesPerEU.second);
+  }
+
+  // We call the "base max functions" directly because otherwise it uses the
+  // subtarget's logic for combining "amdgpu-waves-per-eu" with the function's
+  // groupsize induced occupancy bounds, producing unexpected results.
+  const unsigned MaxSGPRsNoSpill = ST.getBaseMaxNumSGPRs(
+      F, OccBounds, ST.getMaxNumPreloadedSGPRs(), ST.getReservedNumSGPRs(F));
   const unsigned MaxVGPRsNoSpill =
-      ST.getBaseMaxNumVGPRs(MF.getFunction(),
-                            {ST.getMinNumArchVGPRs(OccBounds.second),
-                             ST.getMaxNumArchVGPRs(OccBounds.first)},
-                            false);
-  const unsigned MaxVGPRsIncOcc = ST.getMaxNumArchVGPRs(DAG.MinOccupancy + 1);
+      ST.getBaseMaxNumVGPRs(F, {ST.getMinNumVGPRs(OccBounds.second),
+                                ST.getMaxNumVGPRs(OccBounds.first)});
+  const unsigned MaxSGPRsMinOcc = ST.getMaxNumSGPRs(DAG.MinOccupancy, false);
+  const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
   IncreaseOccupancy = OccBounds.second > DAG.MinOccupancy;
 
+  auto ClearOptRegionsIf = [&](bool Cond) -> bool {
+    if (Cond) {
+      // We won't try to increase occupancy.
+      IncreaseOccupancy = false;
+      OptRegions.clear();
+    }
+    return Cond;
+  };
+
   // Collect optimizable regions. If there is spilling in any region we will
-  // just try to reduce it. Otherwise we will try to increase occupancy by one.
+  // just try to reduce ArchVGPR spilling. Otherwise we will try to increase
+  // occupancy by one in the whole function.
   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
     GCNRegPressure &RP = DAG.Pressure[I];
-    unsigned NumVGPRs = RP.getArchVGPRNum();
     unsigned ExcessRP = 0;
-    if (NumVGPRs > MaxVGPRsNoSpill) {
-      if (IncreaseOccupancy) {
-        // We won't try to increase occupancy.
-        IncreaseOccupancy = false;
-        OptRegions.clear();
-      }
-      // Region has VGPR spilling, we will try to reduce spilling as much as
-      // possible.
-      ExcessRP = NumVGPRs - MaxVGPRsNoSpill;
-      REMAT_DEBUG(dbgs() << "Region " << I << " is spilling VGPRs, save "
-                         << ExcessRP << " VGPR(s) to eliminate spilling\n");
+    unsigned NumSGPRs = RP.getSGPRNum();
+
+    // Check whether SGPR pressures prevents us from eliminating spilling.
+    if (NumSGPRs > MaxSGPRsNoSpill)
+      ClearOptRegionsIf(IncreaseOccupancy);
+
+    bool OccAGPRLimited = false;
+    if (hasExcessVGPRs(RP, MaxVGPRsNoSpill, ExcessRP, OccAGPRLimited)) {
+      ClearOptRegionsIf(IncreaseOccupancy);
+      REMAT_DEBUG({
+        if (ExcessRP) {
+          StringRef RegClass = UnifiedRF ? "VGPRs" : "ArchVGPRs";
+          dbgs() << "Region " << I << " is spilling " << RegClass << ", save "
+                 << ExcessRP << " to eliminate " << RegClass << "-spilling\n";
+        }
+      });
     } else if (IncreaseOccupancy) {
-      if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy) {
-        // Occupancy is SGPR-limited in the region, no point in trying to
-        // increase it through VGPR usage.
-        IncreaseOccupancy = false;
-        OptRegions.clear();
-      } else if (NumVGPRs > MaxVGPRsIncOcc) {
-        // Occupancy is VGPR-limited.
-        ExcessRP = NumVGPRs - MaxVGPRsIncOcc;
-        REMAT_DEBUG(dbgs() << "Region " << I << " has min. occupancy: save "
-                           << ExcessRP << " VGPR(s) to improve occupancy\n");
+      // Check whether SGPR pressure prevents us from increasing occupancy.
+      if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsMinOcc))
+        continue;
+
+      if (hasExcessVGPRs(RP, MaxVGPRsIncOcc, ExcessRP, OccAGPRLimited)) {
+        // Check whether AGPR pressure prevents us from increasing occupancy.
+        if (ClearOptRegionsIf(OccAGPRLimited))
+          continue;
+        // Occupancy could be increased by rematerializing ArchVGPRs.
+        REMAT_DEBUG({
+          if (ExcessRP) {
+            StringRef RegClass = UnifiedRF ? "VGPRs" : "ArchVGPRs";
+            dbgs() << "Region " << I << " has min. occupancy: save " << ExcessRP
+                   << " " << RegClass << " to improve occupancy\n";
+          }
+        });
       }
     }
     if (ExcessRP)
@@ -1738,7 +1808,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
     return false;
 
   // When we are reducing spilling, the target is the minimum achievable
-  // occupancy implied by workgroup sizes.
+  // occupancy implied by workgroup sizes / the "amdgpu-waves-per-eu" attribute.
   TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : OccBounds.first;
 
   // Accounts for a reduction in RP in an optimizable region. Returns whether we
@@ -2058,9 +2128,11 @@ static MachineBasicBlock *getRegionMBB(MachineFunction &MF,
 void PreRARematStage::finalizeGCNSchedStage() {
   // We consider that reducing spilling is always beneficial so we never
   // rollback rematerializations in such cases. It's also possible that
-  // rescheduling lowers occupancy over the one achived just through remats, in
-  // which case we do not want to rollback either.
-  if (!IncreaseOccupancy || AchievedOcc == TargetOcc)
+  // rescheduling lowers occupancy over the one achieved just through remats, in
+  // which case we do not want to rollback either (the rescheduling was already
+  // reverted in PreRARematStage::shouldRevertScheduling in such cases).
+  unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
+  if (!IncreaseOccupancy || MaxOcc == TargetOcc)
     return;
 
   REMAT_DEBUG(dbgs() << "Rollbacking all rematerializations\n");
@@ -2077,10 +2149,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
 
     // Re-rematerialize MI at the end of its original region. Note that it may
     // not be rematerialized exactly in the same position as originally within
-    // the region, but it should not matter much. Since we are only
-    // rematerializing instructions that do not have any virtual reg uses, we
-    // do not need to call LiveRangeEdit::allUsesAvailableAt() and
-    // LiveRangeEdit::canRematerializeAt().
+    // the region, but it should not matter much.
     TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
     MachineInstr *NewMI = &*std::prev(InsertPos);
     NewMI->getOperand(0).setSubReg(SubReg);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 2d6440b1b0730..9fd40d8e5e64f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -485,6 +485,13 @@ class PreRARematStage : public GCNSchedStage {
   /// spilling.
   bool IncreaseOccupancy;
 
+  /// Determines whether there is excess VGPR pressure in \p RP w.r.t. \p
+  /// MaxVGPRs. If there is, \p ExcessArchVGPRs is set to the number of
+  /// ArchVGPRs one must save to eliminate the excess and \p AGPRLimited is set
+  /// to true if AGPR pressure alone causes an excess.
+  bool hasExcessVGPRs(const GCNRegPressure &RP, unsigned MaxVGPRs,
+                      unsigned &ExcessArchVGPRs, bool &AGPRLimited);
+
   /// Returns whether remat can reduce spilling or increase function occupancy
   /// by 1 through rematerialization. If it can do one, collects instructions in
   /// PreRARematStage::Rematerializations and sets the target occupancy in
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 0507b64c46192..6fb7d03a71a85 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -466,7 +466,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
                             getReservedNumSGPRs(MF));
 }
 
-static unsigned getMaxNumPreloadedSGPRs() {
+unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
   using USI = GCNUserSGPRUsageInfo;
   // Max number of user SGPRs
   const unsigned MaxUserSGPRs =
@@ -496,10 +496,8 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
                             getReservedNumSGPRs(F));
 }
 
-unsigned
-GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
-                                 std::pair<unsigned, unsigned> NumVGPRBounds,
-                                 bool UnifiedRF) const {
+unsigned GCNSubtarget::getBaseMaxNumVGPRs(
+    const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
   const auto &[Min, Max] = NumVGPRBounds;
 
   // Check if maximum number of VGPRs was explicitly requested using
@@ -510,7 +508,7 @@ GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
   unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
   if (!Requested)
     return Max;
-  if (UnifiedRF)
+  if (hasGFX90AInsts())
     Requested *= 2;
 
   // Make sure requested value is inside the range of possible VGPR usage.
@@ -520,15 +518,7 @@ GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
   std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
   return getBaseMaxNumVGPRs(
-      F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)},
-      hasGFX90AInsts());
-}
-
-unsigned GCNSubtarget::getMaxNumArchVGPRs(const Function &F) const {
-  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
-  return getBaseMaxNumVGPRs(
-      F, {getMinNumArchVGPRs(Waves.second), getMaxNumArchVGPRs(Waves.first)},
-      false);
+      F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)});
 }
 
 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 8345a498045f5..92c6de8f3d7ad 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1487,6 +1487,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns Reserved number of SGPRs for given function \p F.
   unsigned getReservedNumSGPRs(const Function &F) const;
 
+  /// \returns Maximum number of preloaded SGPRs for the subtarget.
+  unsigned getMaxNumPreloadedSGPRs() const;
+
   /// \returns max num SGPRs. This is the common utility
   /// function called by MachineFunction and Function
   /// variants of getMaxNumSGPRs.
@@ -1547,29 +1550,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
   }
 
-  /// \returns the minimum number of ArchVGPRs that will prevent achieving more
-  /// than the specified number of waves \p WavesPerEU.
-  unsigned getMinNumArchVGPRs(unsigned WavesPerEU) const {
-    return AMDGPU::IsaInfo::getMinNumArchVGPRs(this, WavesPerEU);
-  }
-
   /// \returns the maximum number of VGPRs that can be used and still achieved
   /// at least the specified number of waves \p WavesPerEU.
   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
   }
 
-  /// \returns the maximum number of ArchVGPRs that can be used and still
-  /// achieve at least the specified number of waves \p WavesPerEU.
-  unsigned getMaxNumArchVGPRs(unsigned WavesPerEU) const {
-    return AMDGPU::IsaInfo::getMaxNumArchVGPRs(this, WavesPerEU);
-  }
-
-  /// \returns max num VGPRs. This is the common utility function called by
-  /// MachineFunction and Function variants of getMaxNum[Arch]VGPRs.
+  /// \returns max num VGPRs. This is the common utility function
+  /// called by MachineFunction and Function variants of getMaxNumVGPRs.
   unsigned getBaseMaxNumVGPRs(const Function &F,
-                              std::pair<unsigned, unsigned> NumVGPRBounds,
-                              bool UnifiedRF) const;
+                              std::pair<unsigned, unsigned> NumVGPRBounds) const;
 
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p F, or number of VGPRs explicitly
@@ -1581,12 +1571,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// unit requirement.
   unsigned getMaxNumVGPRs(const Function &F) const;
 
-  /// Returns the maximum number of ArchVGPRs that meets number of waves per
-  /// execution unit requirement for function \p F, or number of ArchVGPRs
-  /// explicitly requested using "amdgpu-num-vgpr" attribute attached to
-  /// function \p F.
-  unsigned getMaxNumArchVGPRs(const Function &F) const;
-
   unsigned getMaxNumAGPRs(const Function &F) const {
     return getMaxNumVGPRs(F);
   }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5f229b310f686..59afcbed35294 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1240,36 +1240,28 @@ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
   return 5;
 }
 
-unsigned getBaseMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
-                            unsigned TotalNumVGPRs,
-                            unsigned NumAddressableVGPRs) {
+unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   assert(WavesPerEU != 0);
+
   unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
   if (WavesPerEU >= MaxWavesPerEU)
     return 0;
-  unsigned MinWavesPerEU =
-      getNumWavesPerEUWithNumVGPRs(STI, NumAddressableVGPRs);
-  if (WavesPerEU < MinWavesPerEU)
-    return getMinNumVGPRs(STI, MinWavesPerEU);
 
+  unsigned TotNumVGPRs = getTotalNumVGPRs(STI);
+  unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI);
   unsigned Granule = getVGPRAllocGranule(STI);
-  unsigned MaxNumVGPRs = alignDown(TotalNumVGPRs / WavesPerEU, Granule);
-  if (MaxNumVGPRs == alignDown(TotalNumVGPRs / MaxWavesPerEU, Granule))
+  unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule);
+
+  if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule))
     return 0;
-  unsigned MaxNumVGPRsNext =
-      alignDown(TotalNumVGPRs / (WavesPerEU + 1), Granule);
-  unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
-  return std::min(MinNumVGPRs, NumAddressableVGPRs);
-}
 
-unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
-  return getBaseMinNumVGPRs(STI, WavesPerEU, getTotalNumVGPRs(STI),
-                            getAddressableNumVGPRs(STI));
-}
+  unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs);
+  if (WavesPerEU < MinWavesPerEU)
+    return getMinNumVGPRs(STI, MinWavesPerEU);
 
-unsigned getMinNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
-  unsigned TotNumArchVGPRs = getAddressableNumArchVGPRs(STI);
-  return getBaseMinNumVGPRs(STI, WavesPerEU, TotNumArchVGPRs, TotNumArchVGPRs);
+  unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule);
+  unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
+  return std::min(MinNumVGPRs, AddrsableNumVGPRs);
 }
 
 unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
@@ -1281,12 +1273,6 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   return std::min(MaxNumVGPRs, AddressableNumVGPRs);
 }
 
-unsigned getMaxNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
-  assert(WavesPerEU != 0);
-  return alignDown(getAddressableNumArchVGPRs(STI) / WavesPerEU,
-                   getVGPRAllocGranule(STI));
-}
-
 unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
                                  std::optional<bool> EnableWavefrontSize32) {
   return getGranulatedNumRegisterBlocks(
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index ebcbd0c07506b..67bebfb3418d5 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -318,29 +318,14 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI);
 /// \returns Addressable number of VGPRs for given subtarget \p STI.
 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI);
 
-/// Returns the minimum number of VGPRs that meets given number of waves per
-/// execution unit requirement for given subtarget \p STI with \p TotNumVGPRs
-/// VGPRs available and \p NumAddressableVGPRs addressable VGPRs.
-unsigned getBaseMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
-                            unsigned TotalNumVGPRs,
-                            unsigned NumAddressableVGPRs);
-
 /// \returns Minimum number of VGPRs that meets given number of waves per
 /// execution unit requirement for given subtarget \p STI.
 unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
-/// \returns Minimum number of ArchVGPRs that meets given number of waves per
-/// execution unit requirement for given subtarget \p STI.
-unsigned getMinNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
-
 /// \returns Maximum number of VGPRs that meets given number of waves per
 /// execution unit requirement for given subtarget \p STI.
 unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
-/// \returns Maximum number of ArchVGPRs that meets given number of waves per
-/// execution unit requirement for given subtarget \p STI.
-unsigned getMaxNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
-
 /// \returns Number of waves reachable for a given \p NumVGPRs usage for given
 /// subtarget \p STI.
 unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 169cc0c51f4a4..8bb8ecb079a34 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2745,8 +2745,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
+; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
 ; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
@@ -6333,8 +6333,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
+; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
 ; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 2edfe67fcd56f..3c0646c46efd0 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -1026,24 +1026,23 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132_DPP-NEXT:    s_mov_b32 s1, s2
+; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1132_DPP-NEXT:    ds_add_rtn_u32 v0, v4, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB2_2:
-; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
@@ -2807,19 +2806,19 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v7, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_add_rtn_u64 v[8:9], v0, v[8:9]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -4449,24 +4448,23 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132_DPP-NEXT:    s_mov_b32 s1, s2
+; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1132_DPP-NEXT:    ds_sub_rtn_u32 v0, v4, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB10_2:
-; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
@@ -6253,19 +6251,19 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v7, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_sub_rtn_u64 v[8:9], v0, v[8:9]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -7197,8 +7195,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
@@ -7618,19 +7616,19 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB16_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -8232,24 +8230,23 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132_DPP-NEXT:    s_mov_b32 s1, s2
+; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB17_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1132_DPP-NEXT:    ds_or_rtn_b32 v0, v4, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB17_2:
-; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
@@ -8562,8 +8559,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
@@ -8983,19 +8980,19 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -9597,24 +9594,23 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132_DPP-NEXT:    s_mov_b32 s1, s2
+; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB19_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1132_DPP-NEXT:    ds_xor_rtn_b32 v0, v4, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB19_2:
-; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
@@ -9927,8 +9923,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
@@ -10348,19 +10344,19 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -11573,8 +11569,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    v_cmp_gt_i64_e64 s8, s[0:1], s[6:7]
@@ -12181,19 +12177,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_max_rtn_i64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -13407,8 +13403,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    v_cmp_lt_i64_e64 s8, s[0:1], s[6:7]
@@ -14015,19 +14011,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_min_rtn_i64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -14630,24 +14626,23 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132_DPP-NEXT:    s_mov_b32 s1, s2
+; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB27_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1132_DPP-NEXT:    ds_max_rtn_u32 v0, v4, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB27_2:
-; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
@@ -15231,8 +15226,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -15835,19 +15830,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_max_rtn_u64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
@@ -17051,8 +17046,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -17653,19 +17648,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB32_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    ds_min_rtn_u64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 7c0ae8e726ad4..d61c4b46596c0 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -2084,14 +2084,15 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
 ; GFX11-SDAG-NEXT:  .LBB14_6: ; %bb.1
 ; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v0, 2, 15
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GFX11-SDAG-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v1, s2
 ; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
@@ -2099,16 +2100,16 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_7
 ; GFX11-SDAG-NEXT:  ; %bb.8:
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s33 dlc
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s1 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s7
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s34, s8
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s7
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
@@ -2363,14 +2364,15 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_4
 ; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.1
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GFX11-SDAG-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v1, s3
 ; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
@@ -2378,14 +2380,13 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB15_2
 ; GFX11-SDAG-NEXT:  ; %bb.3:
 ; GFX11-SDAG-NEXT:    s_add_i32 s2, s32, 0x7ff
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX11-SDAG-NEXT:    s_and_b32 s2, s2, 0xfffff800
 ; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr31
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s2
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s1, 5, s2
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
 ; GFX11-SDAG-NEXT:  .LBB15_4: ; %Flow
 ; GFX11-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_8
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 2a17e5733f597..15be44a335a1d 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -2585,16 +2585,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_dual_add_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -2834,16 +2835,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_add_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -4639,16 +4641,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_dual_add_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -4888,16 +4891,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_add_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -7928,17 +7932,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -8219,9 +8222,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -8229,7 +8233,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -8476,9 +8479,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -8486,7 +8490,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -9652,9 +9655,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -9662,7 +9666,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -9909,9 +9912,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -9919,7 +9923,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -13985,17 +13988,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 2892641017296..a4410bb9ed2d0 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -5353,17 +5353,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
@@ -5557,15 +5556,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
 ; GFX1132-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
@@ -5723,15 +5722,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
 ; GFX1132-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -8902,17 +8901,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 40cca6bfc63cb..68d7dcc60506c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -5353,17 +5353,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
@@ -5557,15 +5556,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
 ; GFX1132-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
@@ -5723,15 +5722,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
 ; GFX1132-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
@@ -8902,17 +8901,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index e2d58b80e01f7..7126680525b87 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -195,16 +195,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s4, s[0:1], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
 ; GFX1132-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -395,16 +396,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s4, s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
 ; GFX1132-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -1475,16 +1477,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -1724,16 +1727,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -2805,16 +2809,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -3054,16 +3059,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -4963,16 +4969,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -5212,16 +5219,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
 ; GFX1132-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_sub_f32 v0, v1, v2 :: v_dual_mov_b32 v3, 0
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -8252,17 +8260,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -8543,9 +8550,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -8553,7 +8561,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -8800,9 +8807,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -8810,7 +8818,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -9975,9 +9982,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -9985,7 +9993,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -10232,9 +10239,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
@@ -10242,7 +10250,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -14307,17 +14314,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s46, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[44:45], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[44:45]
 ; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index c71d080b066b0..5fab0c50bbe57 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -565,23 +565,22 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ;
 ; GFX11-LABEL: srem32_invariant_denom:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_abs_i32 s2, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_abs_i32 s2, s0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX11-NEXT:    s_sub_i32 s3, 0, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_i32 s3, s3, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_hi_u32 s5, s4, s3
 ; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:    s_add_i32 s4, s4, s5
@@ -602,6 +601,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX11-NEXT:    s_add_i32 s3, s3, 1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s5
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, 4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, 0
@@ -694,32 +694,31 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX11-NEXT:    s_mov_b32 s2, 0
+; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB4_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    s_and_b32 s3, 0xffff, s2
 ; GFX11-NEXT:    s_add_i32 s2, s2, 1
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, s3
 ; GFX11-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX11-NEXT:    s_and_b32 s3, s2, 0xffff
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    s_cmpk_eq_i32 s3, 0x400
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v1, v2, v1
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX11-NEXT:    s_cmpk_eq_i32 s3, 0x400
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f32 v2, -v1, v0, v2
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    global_store_b16 v3, v1, s[0:1]
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT:    global_store_b16 v4, v2, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB4_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
@@ -813,34 +812,33 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB5_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    s_and_b32 s4, 0xffff, s3
 ; GFX11-NEXT:    s_add_i32 s3, s3, 1
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, s4
 ; GFX11-NEXT:    s_lshl_b32 s5, s4, 1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX11-NEXT:    v_fma_f32 v2, -v1, v0, v2
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
-; GFX11-NEXT:    v_mov_b32_e32 v2, s5
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_lo_u32 v1, v1, s2
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v3, s5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_lo_u32 v2, v2, s2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
 ; GFX11-NEXT:    s_and_b32 s4, s3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmpk_eq_i32 s4, 0x400
-; GFX11-NEXT:    global_store_b16 v2, v1, s[0:1]
+; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB5_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
@@ -942,37 +940,38 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_sext_i32_i16 s2, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB6_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    s_sext_i32_i16 s4, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX11-NEXT:    s_xor_b32 s4, s4, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_or_b32 s4, s4, 1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX11-NEXT:    v_fma_f32 v2, -v1, v0, v2
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s5, |v2|, |v0|
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_and_b32 s5, s5, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s4, s4, 0
 ; GFX11-NEXT:    s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT:    s_add_i32 s3, s3, 1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, s4, v2
 ; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_add_nc_u32 v1, s4, v1
+; GFX11-NEXT:    s_add_i32 s3, s3, 1
+; GFX11-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX11-NEXT:    s_and_b32 s4, s3, 0xffff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmpk_eq_i32 s4, 0x400
-; GFX11-NEXT:    global_store_b16 v2, v1, s[0:1]
+; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB6_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
@@ -1078,42 +1077,42 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_sext_i32_i16 s2, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB7_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    s_sext_i32_i16 s4, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX11-NEXT:    s_xor_b32 s5, s4, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_ashr_i32 s5, s5, 30
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_or_b32 s5, s5, 1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX11-NEXT:    v_fma_f32 v2, -v1, v0, v2
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s6, |v2|, |v0|
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s5, s5, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, s5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, s5, v2
 ; GFX11-NEXT:    s_and_b32 s5, 0xffff, s3
 ; GFX11-NEXT:    s_add_i32 s3, s3, 1
 ; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s5
-; GFX11-NEXT:    v_mul_lo_u32 v1, v1, s2
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_lo_u32 v2, v2, s2
+; GFX11-NEXT:    v_mov_b32_e32 v3, s5
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
 ; GFX11-NEXT:    s_and_b32 s4, s3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmpk_eq_i32 s4, 0x400
-; GFX11-NEXT:    global_store_b16 v2, v1, s[0:1]
+; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB7_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index 0317df6f6a044..f72f1e52d135f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -463,7 +463,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-LABEL: divergent_value:
 ; GFX1132DAGISEL:       ; %bb.0: ; %entry
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0
 ; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
@@ -476,7 +476,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
-; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132DAGISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 5ec0224898bfd..4551c60770bdf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -464,7 +464,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1132DAGISEL-LABEL: divergent_value:
 ; GFX1132DAGISEL:       ; %bb.0: ; %entry
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
@@ -477,7 +477,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
-; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132DAGISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
index e16ac07236e30..eb2d95e4db2d5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -38,15 +38,14 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_const_idx(<4 x i32> %ad
 ; CHECK-LABEL: struct_atomic_buffer_load_i32_const_idx:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:  .LBB1_1: ; %bb1
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v1, 15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
 ; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
index cc34020e0a28a..bc50b12b59049 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -38,15 +38,14 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_const_idx(ptr addrs
 ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:  .LBB1_1: ; %bb1
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v1, 15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
 ; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index 506753f8640e1..dd77e575e7505 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -100,7 +100,7 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_3
 ; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
 ; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:  .LBB2_2: ; %for.body
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_wait_kmcnt 0x0
@@ -110,9 +110,9 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GCN-NEXT:    v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
-; GCN-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GCN-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
+; GCN-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
+; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_2
 ; GCN-NEXT:  .LBB2_3: ; %for.end
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
index 3c15201cd698e..557ca065f6129 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
@@ -1,21 +1,25 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX90A %s
 
 --- |
-  ; User-requested maximum number of VGPRs need to be taken into account by
-  ; the scheduler's rematerialization stage. Register usage above that number
-  ; is considered like spill; occupancy is "inadvertently" increased when
-  ; eliminating spill.
   define void @small_num_vgprs_as_spill() "amdgpu-num-vgpr"="28" {
     ret void
   }
-
-  ; Min/Max occupancy is 8, the scheduler's rematerialization stage should not
-  ; try to rematerialize instructions.
+  define void @dont_remat_waves_per_eu() "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-waves-per-eu"="7,7" {
+    ret void
+  }
   define void @dont_remat_at_max_occ() "amdgpu-flat-work-group-size"="1024,1024" {
     ret void
   }
+  define void @reduce_arch_and_acc_vgrp_spill() "amdgpu-flat-work-group-size"="1024,1024" {
+    ret void
+  }
 ---
+# User-requested maximum number of VGPRs need to be taken into account by
+# the scheduler's rematerialization stage. Register usage above that number
+# is considered like spill; occupancy is "inadvertently" increased when
+# eliminating spill.
 name:            small_num_vgprs_as_spill
 tracksRegLiveness: true
 machineFunctionInfo:
@@ -69,6 +73,55 @@ body:             |
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_27]]
   ; GFX908-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX90A-LABEL: name: small_num_vgprs_as_spill
+  ; GFX90A: bb.0:
+  ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.1:
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_33]]
+  ; GFX90A-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1
 
@@ -119,6 +172,172 @@ body:             |
 
     S_ENDPGM 0
 ...
+# Min/Max occupancy is 8, but user requests 7, the scheduler's rematerialization
+# stage should not try to rematerialize instructions.
+---
+name:            dont_remat_waves_per_eu
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: dont_remat_waves_per_eu
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_35:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 35, implicit $exec, implicit $mode
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_34]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_35]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX90A-LABEL: name: dont_remat_waves_per_eu
+  ; GFX90A: bb.0:
+  ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_35:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 35, implicit $exec, implicit $mode
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.1:
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_34]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_35]]
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode,
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode,
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode,
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode,
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode,
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode,
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode,
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode,
+    %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode,
+    %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode,
+    %34:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode,
+    %35:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 35, implicit $exec, implicit $mode,
+
+  bb.1:
+
+    S_NOP 0, implicit %0,   implicit %1,   implicit %2,   implicit %3,   implicit %4
+    S_NOP 0, implicit %5,   implicit %6,   implicit %7,   implicit %8,   implicit %9
+    S_NOP 0, implicit %10,  implicit %11,  implicit %12,  implicit %13,  implicit %14
+    S_NOP 0, implicit %15,  implicit %16,  implicit %17,  implicit %18,  implicit %19
+    S_NOP 0, implicit %20,  implicit %21,  implicit %22,  implicit %23,  implicit %24
+    S_NOP 0, implicit %25,  implicit %26,  implicit %27,  implicit %28,  implicit %29
+    S_NOP 0, implicit %30,  implicit %31,  implicit %32,  implicit %33,  implicit %34
+    S_NOP 0, implicit %35
+
+    S_ENDPGM 0
+...
+# Min/Max occupancy is 8, the scheduler's rematerialization stage should not
+# try to rematerialize instructions.
 ---
 name:            dont_remat_at_max_occ
 tracksRegLiveness: true
@@ -171,6 +390,53 @@ body:             |
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]]
   ; GFX908-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX90A-LABEL: name: dont_remat_at_max_occ
+  ; GFX90A: bb.0:
+  ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.1:
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX90A-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1
 
@@ -219,3 +485,278 @@ body:             |
 
     S_ENDPGM 0
 ...
+# Min/Max occupancy is 8. For targets with unified RF we are able to eliminate
+# VGPR spilling w.r.t. minimum workgroup size induced occupancy. We can only
+# eliminate ArchVGPR spilling in the non-unified RF case, which requires less
+# remats than in the unified case. 
+---
+name:            reduce_arch_and_acc_vgrp_spill
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: reduce_arch_and_acc_vgrp_spill
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF3:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF4:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF5:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF6:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF7:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF8:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF9:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF10:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF11:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF12:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF13:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF14:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF15:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF16:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF17:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF18:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF19:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF20:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF21:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF22:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF23:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF24:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF25:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF26:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF27:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF28:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF29:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF30:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF31:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[DEF32:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]], implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[DEF32]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX90A-LABEL: name: reduce_arch_and_acc_vgrp_spill
+  ; GFX90A: bb.0:
+  ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[DEF:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF2:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF3:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF4:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF5:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF6:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF7:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF8:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF9:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF10:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF11:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF12:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF13:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF14:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF15:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF16:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF17:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF18:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF19:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF20:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF21:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF22:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF23:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF24:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF25:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF26:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF27:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF28:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF29:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF30:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF31:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[DEF32:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.1:
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]]
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]], implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX90A-NEXT:   S_NOP 0, implicit [[DEF32]]
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
+    %32:agpr_32 = IMPLICIT_DEF
+    %33:agpr_32 = IMPLICIT_DEF
+    %34:agpr_32 = IMPLICIT_DEF
+    %35:agpr_32 = IMPLICIT_DEF
+    %36:agpr_32 = IMPLICIT_DEF
+    %37:agpr_32 = IMPLICIT_DEF
+    %38:agpr_32 = IMPLICIT_DEF
+    %39:agpr_32 = IMPLICIT_DEF
+    %40:agpr_32 = IMPLICIT_DEF
+    %41:agpr_32 = IMPLICIT_DEF
+    %42:agpr_32 = IMPLICIT_DEF
+    %43:agpr_32 = IMPLICIT_DEF
+    %44:agpr_32 = IMPLICIT_DEF
+    %45:agpr_32 = IMPLICIT_DEF
+    %46:agpr_32 = IMPLICIT_DEF
+    %47:agpr_32 = IMPLICIT_DEF
+    %48:agpr_32 = IMPLICIT_DEF
+    %49:agpr_32 = IMPLICIT_DEF
+    %50:agpr_32 = IMPLICIT_DEF
+    %51:agpr_32 = IMPLICIT_DEF
+    %52:agpr_32 = IMPLICIT_DEF
+    %53:agpr_32 = IMPLICIT_DEF
+    %54:agpr_32 = IMPLICIT_DEF
+    %55:agpr_32 = IMPLICIT_DEF
+    %56:agpr_32 = IMPLICIT_DEF
+    %57:agpr_32 = IMPLICIT_DEF
+    %58:agpr_32 = IMPLICIT_DEF
+    %59:agpr_32 = IMPLICIT_DEF
+    %60:agpr_32 = IMPLICIT_DEF
+    %61:agpr_32 = IMPLICIT_DEF
+    %62:agpr_32 = IMPLICIT_DEF
+    %63:agpr_32 = IMPLICIT_DEF
+
+    %64:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode
+    %65:agpr_32 = IMPLICIT_DEF
+
+  bb.1:
+
+    S_NOP 0, implicit %0,   implicit %1,   implicit %2,   implicit %3,   implicit %4
+    S_NOP 0, implicit %5,   implicit %6,   implicit %7,   implicit %8,   implicit %9
+    S_NOP 0, implicit %10,  implicit %11,  implicit %12,  implicit %13,  implicit %14
+    S_NOP 0, implicit %15,  implicit %16,  implicit %17,  implicit %18,  implicit %19
+    S_NOP 0, implicit %20,  implicit %21,  implicit %22,  implicit %23,  implicit %24
+    S_NOP 0, implicit %25,  implicit %26,  implicit %27,  implicit %28,  implicit %29
+    S_NOP 0, implicit %30,  implicit %31,  implicit %32,  implicit %33,  implicit %34
+    S_NOP 0, implicit %35,  implicit %36,  implicit %37,  implicit %38,  implicit %39
+    S_NOP 0, implicit %40,  implicit %41,  implicit %42,  implicit %43,  implicit %44
+    S_NOP 0, implicit %45,  implicit %46,  implicit %47,  implicit %48,  implicit %49
+    S_NOP 0, implicit %50,  implicit %51,  implicit %52,  implicit %53,  implicit %54
+    S_NOP 0, implicit %55,  implicit %56,  implicit %57,  implicit %58,  implicit %59
+    S_NOP 0, implicit %60,  implicit %61,  implicit %62,  implicit %63,  implicit %64
+    S_NOP 0, implicit %65
+
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index bf1ab9a07d5f6..d62f045674ace 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -54,6 +54,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ;
 ; GFX12-LABEL: _amdgpu_cs_main:
 ; GFX12:       ; %bb.0: ; %branch1_true
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_mov_b32 s4, 0
@@ -76,7 +77,6 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX12-NEXT:    s_cbranch_execz .LBB0_1
 ; GFX12-NEXT:  ; %bb.3: ; %branch2_merge
 ; GFX12-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_mov_b32 s5, s4
 ; GFX12-NEXT:    s_mov_b32 s6, s4
 ; GFX12-NEXT:    s_mov_b32 s7, s4
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
index ffa0cf5b9b3e0..538ce15979de8 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -76,7 +76,7 @@
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x22
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
-; CHECK-NEXT:        .vgpr_count:     0x2
+; CHECK-NEXT:        .vgpr_count:     0x3
 ; CHECK-NEXT:      multiple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x24
 ; CHECK-NEXT:        .lds_size:       0

>From cfc61472ddc6c365a1d2ad40c6c8cac6732e9025 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Mon, 17 Feb 2025 12:45:11 +0000
Subject: [PATCH 6/8] == -> >= for achieved occupancy check at the end of the
 stage

This prevents rollbacking rematerializations in case rescheduling
improved occupancy even further than the target.

Also fix format.
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 2 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h       | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 8f5b96fb788d8..6ae7840f11595 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2132,7 +2132,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
   // which case we do not want to rollback either (the rescheduling was already
   // reverted in PreRARematStage::shouldRevertScheduling in such cases).
   unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
-  if (!IncreaseOccupancy || MaxOcc == TargetOcc)
+  if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
     return;
 
   REMAT_DEBUG(dbgs() << "Rollbacking all rematerializations\n");
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 92c6de8f3d7ad..f87a9038b95ca 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1558,8 +1558,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   /// \returns max num VGPRs. This is the common utility function
   /// called by MachineFunction and Function variants of getMaxNumVGPRs.
-  unsigned getBaseMaxNumVGPRs(const Function &F,
-                              std::pair<unsigned, unsigned> NumVGPRBounds) const;
+  unsigned
+  getBaseMaxNumVGPRs(const Function &F,
+                     std::pair<unsigned, unsigned> NumVGPRBounds) const;
 
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p F, or number of VGPRs explicitly

>From 1216b5d8328d08366d34cf6c457691421cc68812 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 20 Feb 2025 00:20:22 +0000
Subject: [PATCH 7/8] Address feedback

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 15 +++++--
 ...ine-scheduler-sink-trivial-remats-attr.mir | 24 +++++------
 .../machine-scheduler-sink-trivial-remats.mir | 42 +++++++++++++------
 3 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 6ae7840f11595..0fb4c77fa8ad4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1690,6 +1690,7 @@ bool PreRARematStage::hasExcessVGPRs(const GCNRegPressure &RP,
       HasSpill = true;
     }
     if (NumAGPRs > MaxVGPRs) {
+      ExcessArchVGPRs = NumArchVGPRs;
       AGPRLimited = true;
       HasSpill = true;
     }
@@ -1747,7 +1748,8 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   const unsigned MaxVGPRsNoSpill =
       ST.getBaseMaxNumVGPRs(F, {ST.getMinNumVGPRs(OccBounds.second),
                                 ST.getMaxNumVGPRs(OccBounds.first)});
-  const unsigned MaxSGPRsMinOcc = ST.getMaxNumSGPRs(DAG.MinOccupancy, false);
+  const unsigned MaxSGPRsIncOcc =
+      ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
   const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
   IncreaseOccupancy = OccBounds.second > DAG.MinOccupancy;
 
@@ -1784,13 +1786,20 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
       });
     } else if (IncreaseOccupancy) {
       // Check whether SGPR pressure prevents us from increasing occupancy.
-      if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsMinOcc))
+      if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
+        if (DAG.MinOccupancy >= OccBounds.first)
+          return false;
         continue;
+      }
 
       if (hasExcessVGPRs(RP, MaxVGPRsIncOcc, ExcessRP, OccAGPRLimited)) {
         // Check whether AGPR pressure prevents us from increasing occupancy.
-        if (ClearOptRegionsIf(OccAGPRLimited))
+        if (ClearOptRegionsIf(OccAGPRLimited)) {
+          if (DAG.MinOccupancy >= OccBounds.first)
+            return false;
           continue;
+        }
+
         // Occupancy could be increased by rematerializing ArchVGPRs.
         REMAT_DEBUG({
           if (ExcessRP) {
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
index 557ca065f6129..dd190b70c25e5 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
@@ -486,9 +486,9 @@ body:             |
     S_ENDPGM 0
 ...
 # Min/Max occupancy is 8. For targets with unified RF we are able to eliminate
-# VGPR spilling w.r.t. minimum workgroup size induced occupancy. We can only
-# eliminate ArchVGPR spilling in the non-unified RF case, which requires less
-# remats than in the unified case. 
+# VGPR spilling w.r.t. minimum workgroup size induced occupancy using just
+# enough remats. In the non-unified RF case we remat all eligible ArchVGPRs to
+# try to eliminate both ArchVGPR and AGPR spilling.
 ---
 name:            reduce_arch_and_acc_vgrp_spill
 tracksRegLiveness: true
@@ -526,11 +526,6 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
@@ -568,18 +563,23 @@ body:             |
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]], implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]], implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode
   ; GFX908-NEXT:   S_NOP 0, implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]], implicit [[V_CVT_I32_F64_e32_32]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[DEF32]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index ab7f8d544ddd2..f9413eed7521c 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -1168,21 +1168,26 @@ body:             |
   ; GFX908-NEXT:   [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
   ; GFX908-NEXT:   [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
   ; GFX908-NEXT:   [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 80
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -1196,14 +1201,13 @@ body:             |
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   successors: %bb.3(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.3:
   ; GFX908-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
   ; GFX908-NEXT:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
   ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
   ; GFX908-NEXT:   S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
@@ -1228,7 +1232,11 @@ body:             |
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_25]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_5]], implicit [[S_MOV_B32_6]]
@@ -1307,7 +1315,12 @@ body:             |
     %34:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
     %35:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
     %36:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
-    %37:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+    %37:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+    %38:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+    %39:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+    %40:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+    %41:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+    %42:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
 
     %100:sgpr_32 = S_MOV_B32 0
     %101:sgpr_32 = S_MOV_B32 1
@@ -1430,7 +1443,10 @@ body:             |
     S_NOP 0, implicit %30, implicit %31
     S_NOP 0, implicit %32, implicit %33
     S_NOP 0, implicit %34, implicit %35
-    S_NOP 0, implicit %36
+    S_NOP 0, implicit %36, implicit %37
+    S_NOP 0, implicit %38, implicit %39
+    S_NOP 0, implicit %40, implicit %41
+    S_NOP 0, implicit %42
 
     S_NOP 0, implicit %100, implicit %101
     S_NOP 0, implicit %102, implicit %103

>From a1f1e4de7be2a0a5a6b985be8c1f3ff18007113a Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 27 Feb 2025 12:09:43 +0000
Subject: [PATCH 8/8] Incorporate machine scheduler change from main

Now that single-MI regions are no longer filtered out by the machine
scheduler, we can greatly simplify the logic to update region boundaries
during rematerialization/rollbacking, at the cost of a single walk
through the MF's instructions at the beginning of the stage.
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 96 ++++++++++-----------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   | 17 ++--
 2 files changed, 54 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 0fb4c77fa8ad4..7a4f08aef0453 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1099,6 +1099,13 @@ bool PreRARematStage::initGCNSchedStage() {
       DAG.Regions.size() == 1 || !canIncreaseOccupancyOrReduceSpill())
     return false;
 
+  // Map all MIs to their parent region.
+  for (unsigned I = 0, E = DAG.Regions.size(); I < E; ++I) {
+    RegionBoundaries Region = DAG.Regions[I];
+    for (auto MI = Region.first; MI != Region.second; ++MI)
+      MIRegion.insert({&*MI, I});
+  }
+
   // Rematerialize identified instructions and update scheduler's state.
   rematerialize();
   if (GCNTrackers)
@@ -1841,7 +1848,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   // Cache set of registers that are going to be rematerialized.
   DenseSet<unsigned> RematRegs;
 
-  // identify rematerializable instructions in the function.
+  // Identify rematerializable instructions in the function.
   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
     auto Region = DAG.Regions[I];
     for (auto MI = Region.first; MI != Region.second; ++MI) {
@@ -1885,7 +1892,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
 
       REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
       RematInstruction &Remat =
-          Rematerializations.try_emplace(&DefMI, I, UseMI).first->second;
+          Rematerializations.try_emplace(&DefMI, UseMI).first->second;
 
       bool RematUseful = false;
       if (auto It = OptRegions.find(I); It != OptRegions.end()) {
@@ -1957,6 +1964,7 @@ void PreRARematStage::rematerialize() {
     MachineBasicBlock::iterator InsertPos(Remat.UseMI);
     Register Reg = DefMI->getOperand(0).getReg();
     unsigned SubReg = DefMI->getOperand(0).getSubReg();
+    unsigned DefRegion = MIRegion.at(DefMI);
 
     // Rematerialize DefMI to its use block.
     TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
@@ -1968,8 +1976,12 @@ void PreRARematStage::rematerialize() {
     // Update region boundaries in regions we sinked from (remove defining MI)
     // and to (insert MI rematerialized in use block). Only then we can erase
     // the original MI.
-    DAG.updateRegionBoundaries(DAG.Regions, DefMI, nullptr);
-    DAG.updateRegionBoundaries(DAG.Regions, InsertPos, Remat.RematMI);
+    DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
+    auto UseRegion = MIRegion.find(Remat.UseMI);
+    if (UseRegion != MIRegion.end()) {
+      DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
+                                 Remat.RematMI);
+    }
     DefMI->eraseFromParent();
     DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
 
@@ -2024,8 +2036,8 @@ void PreRARematStage::rematerialize() {
     }
     // RP in the region from which the instruction was rematerialized may or may
     // not decrease.
-    ImpactedRegions.insert({Remat.DefRegion, DAG.Pressure[Remat.DefRegion]});
-    RecomputeRP.insert(Remat.DefRegion);
+    ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
+    RecomputeRP.insert(DefRegion);
 
     // Update the register's live interval manually. This is aligned with the
     // instruction collection phase in that it assumes a single def and use
@@ -2149,10 +2161,11 @@ void PreRARematStage::finalizeGCNSchedStage() {
       static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
 
   // Rollback the rematerializations.
-  for (const auto &[_, Remat] : Rematerializations) {
+  for (const auto &[DefMI, Remat] : Rematerializations) {
     MachineInstr &RematMI = *Remat.RematMI;
-    MachineBasicBlock::iterator InsertPos(DAG.Regions[Remat.DefRegion].second);
-    MachineBasicBlock *MBB = getRegionMBB(MF, DAG.Regions[Remat.DefRegion]);
+    unsigned DefRegion = MIRegion.at(DefMI);
+    MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
+    MachineBasicBlock *MBB = getRegionMBB(MF, DAG.Regions[DefRegion]);
     Register Reg = RematMI.getOperand(0).getReg();
     unsigned SubReg = RematMI.getOperand(0).getSubReg();
 
@@ -2164,6 +2177,13 @@ void PreRARematStage::finalizeGCNSchedStage() {
     NewMI->getOperand(0).setSubReg(SubReg);
     DAG.LIS->InsertMachineInstrInMaps(*NewMI);
 
+    auto UseRegion = MIRegion.find(Remat.UseMI);
+    if (UseRegion != MIRegion.end()) {
+      DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
+                                 nullptr);
+    }
+    DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
+
     // Erase rematerialized MI.
     RematMI.eraseFromParent();
     DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
@@ -2185,49 +2205,25 @@ void PreRARematStage::finalizeGCNSchedStage() {
 }
 
 void GCNScheduleDAGMILive::updateRegionBoundaries(
-    SmallVectorImpl<RegionBoundaries> &RegionBoundaries,
-    MachineBasicBlock::iterator MI, MachineInstr *NewMI) {
-  unsigned I = 0, E = RegionBoundaries.size();
-  // Search for first region of the block where MI is located. We may encounter
-  // an empty region if all instructions from an initially non-empty region were
-  // removed.
-  while (I != E && RegionBoundaries[I].first != RegionBoundaries[I].second &&
-         MI->getParent() != RegionBoundaries[I].first->getParent())
-    ++I;
-
-  for (; I != E; ++I) {
-    auto &Bounds = RegionBoundaries[I];
-    assert(MI != Bounds.second && "cannot insert at region end");
-    assert(!NewMI || NewMI != Bounds.second && "cannot remove at region end");
-
-    // We may encounter an empty region if all of the region' instructions were
-    // previously removed.
-    if (Bounds.first == Bounds.second) {
-      if (MI->getParent()->end() != Bounds.second)
-        return;
-      continue;
-    }
-    if (MI->getParent() != Bounds.first->getParent())
-      return;
-
-    // We only care for modifications at the beginning of the region since the
-    // upper region boundary is exclusive.
-    if (MI != Bounds.first)
-      continue;
-    if (!NewMI) {
-      // This is an MI removal, which may leave the region empty; in such cases
-      // set both boundaries to the removed instruction's MBB's end.
-      MachineBasicBlock::iterator NextMI = std::next(MI);
-      if (NextMI != Bounds.second)
-        Bounds.first = NextMI;
-      else
-        Bounds.first = Bounds.second;
-    } else {
-      // This is an MI insertion at the beggining of the region.
-      Bounds.first = NewMI;
-    }
+    RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
+    MachineInstr *NewMI) {
+  assert(!NewMI ||
+         NewMI != RegionBounds.second && "cannot remove at region end");
+
+  if (RegionBounds.first == RegionBounds.second) {
+    assert(NewMI && "cannot remove from an empty region");
+    RegionBounds.first = NewMI;
     return;
   }
+
+  // We only care for modifications at the beginning of a non-empty region since
+  // the upper region boundary is exclusive.
+  if (MI != RegionBounds.first)
+    return;
+  if (!NewMI)
+    RegionBounds.first = std::next(MI); // Removal
+  else
+    RegionBounds.first = NewMI; // Insertion
 }
 
 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 9fd40d8e5e64f..e8a169072dee4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -295,11 +295,10 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   /// If necessary, updates a region's boundaries following insertion ( \p NewMI
   /// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region.
   /// For an MI removal, this must be called before the MI is actually erased
-  /// from its parent MBB. If a region is left empty by a removal, both
-  /// boundaries are set to the last removed MI's MBB's end.
-  void
-  updateRegionBoundaries(SmallVectorImpl<RegionBoundaries> &RegionBoundaries,
-                         MachineBasicBlock::iterator MI, MachineInstr *NewMI);
+  /// from its parent MBB.
+  void updateRegionBoundaries(RegionBoundaries &RegionBounds,
+                              MachineBasicBlock::iterator MI,
+                              MachineInstr *NewMI);
 
   void runSchedStages();
 
@@ -460,13 +459,13 @@ class PreRARematStage : public GCNSchedStage {
     /// Set of regions in which the rematerializable instruction's defined
     /// register is a live-in.
     SmallDenseSet<unsigned, 4> LiveInRegions;
-    /// Region containing the rematerializable instruction.
-    unsigned DefRegion;
 
-    RematInstruction(unsigned DefRegion, MachineInstr *UseMI)
-        : UseMI(UseMI), DefRegion(DefRegion) {}
+    RematInstruction(MachineInstr *UseMI) : UseMI(UseMI) {}
   };
 
+  /// Maps all MIs to their parent region. MI terminators are considered to be
+  /// outside the region they delimitate, and as such are not stored in the map.
+  DenseMap<MachineInstr *, unsigned> MIRegion;
   /// Collects instructions to rematerialize.
   MapVector<MachineInstr *, RematInstruction> Rematerializations;
   /// Collect regions whose live-ins or register pressure will change due to



More information about the llvm-commits mailing list