[llvm] [AMDGPU][Scheduler] Support for rematerializing SGPRs and AGPRs (PR #140036)

via llvm-commits llvm-commits at lists.llvm.org
Thu May 15 03:15:27 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

Author: Lucas Ramirez (lucas-rami)

<details>
<summary>Changes</summary>

This adds the ability to rematerialize SGPRs and AGPRs to the scheduler's `PreRARematStage`, which currently can only rematerialize ArchVGPRs. It also fixes a small potential issue in the stage where, in case of spilling, the target occupancy could be set to a lower-than-expected value when the function has either of the "amdgpu-num-sgpr" or "amdgpu-num-vgpr" attributes set.
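
At a high level, the occupancy fix simply prevents the reduce-spilling target from dropping below the current minimum region occupancy, which can exceed the subtarget's minimum waves/EU when one of those attributes constrains the register budget. Below is a minimal, self-contained sketch of that selection logic (a hypothetical helper for illustration only; the actual change lives in `PreRARematStage::canIncreaseOccupancyOrReduceSpill` in the diff below):

```cpp
#include <algorithm>

// Hypothetical, simplified model of the target-occupancy selection described
// above; not the actual PreRARematStage code. When increasing occupancy, aim
// one wave/EU above the current minimum. When only reducing spilling, never
// target fewer waves than the current minimum region occupancy, which may be
// higher than the subtarget's minimum when "amdgpu-num-sgpr" or
// "amdgpu-num-vgpr" restrict the register budget.
static unsigned selectTargetOccupancy(bool IncreaseOccupancy,
                                      unsigned MinOccupancy,
                                      unsigned MinWavesPerEU) {
  return IncreaseOccupancy ? MinOccupancy + 1
                           : std::max(MinOccupancy, MinWavesPerEU);
}
```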

---

Patch is 95.37 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140036.diff


4 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+59-64) 
- (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.h (-2) 
- (modified) llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir (+402-72) 
- (modified) llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir (+183-183) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index a9c891a1f1dd1..6f6abf5a20930 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1702,6 +1702,8 @@ namespace {
 /// Models excess register pressure in a region and tracks our progress as we
 /// identify rematerialization opportunities.
 struct ExcessRP {
+  /// Number of excess SGPRs.
+  unsigned SGPRs = 0;
   /// Number of excess ArchVGPRs.
   unsigned ArchVGPRs = 0;
   /// Number of excess AGPRs.
@@ -1717,8 +1719,13 @@ struct ExcessRP {
   bool UnifiedRF;
 
   /// Constructs the excess RP model; determines the excess pressure w.r.t. a
-  /// maximum number of allowed VGPRs.
-  ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs);
+  /// maximum number of allowed SGPRs/VGPRs.
+  ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxSGPRs,
+           unsigned MaxVGPRs);
+
+  /// Accounts for \p NumRegs saved SGPRs in the model. Returns whether saving
+  /// these SGPRs helped reduce excess pressure.
+  bool saveSGPRs(unsigned NumRegs) { return saveRegs(SGPRs, NumRegs); }
 
   /// Accounts for \p NumRegs saved ArchVGPRs in the model. If \p
   /// UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess
@@ -1726,17 +1733,20 @@ struct ExcessRP {
   /// saving these ArchVGPRs helped reduce excess pressure.
   bool saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill);
 
-  /// Accounts for \p NumRegs saved AGPRS in the model. Returns whether saving
-  /// these ArchVGPRs helped reduce excess pressure.
-  bool saveAGPRs(unsigned NumRegs);
+  /// Accounts for \p NumRegs saved AGPRs in the model. Returns whether saving
+  /// these AGPRs helped reduce excess pressure.
+  bool saveAGPRs(unsigned NumRegs) {
+    return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
+  }
 
   /// Returns whether there is any excess register pressure.
-  operator bool() const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0; }
+  operator bool() const { return SGPRs || ArchVGPRs || AGPRs || VGPRs; }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   friend raw_ostream &operator<<(raw_ostream &OS, const ExcessRP &Excess) {
-    OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and "
-       << Excess.VGPRs << " VGPRs (next ArchVGPR aligment in "
+    OS << Excess.SGPRs << " SGPRs, " << Excess.ArchVGPRs << " ArchVGPRs, and "
+       << Excess.AGPRs << " AGPRs (" << Excess.VGPRs
+       << " VGPRs in total, next ArchVGPR alignment in "
        << Excess.ArchVGPRsToAlignment << " registers)\n";
     return OS;
   }
@@ -1753,12 +1763,17 @@ struct ExcessRP {
 } // namespace
 
 ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
-                   unsigned MaxVGPRs)
+                   unsigned MaxSGPRs, unsigned MaxVGPRs)
     : UnifiedRF(ST.hasGFX90AInsts()) {
+  // Compute excess SGPR pressure.
+  unsigned NumSGPRs = RP.getSGPRNum();
+  if (NumSGPRs > MaxSGPRs)
+    SGPRs = NumSGPRs - MaxSGPRs;
+
+  // Compute excess ArchVGPR/AGPR pressure.
   unsigned NumArchVGPRs = RP.getArchVGPRNum();
   unsigned NumAGPRs = RP.getAGPRNum();
   HasAGPRs = NumAGPRs;
-
   if (!UnifiedRF) {
     // Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
     // independently.
@@ -1839,10 +1854,6 @@ bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
   return Progress;
 }
 
-bool ExcessRP::saveAGPRs(unsigned NumRegs) {
-  return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
-}
-
 bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
 
@@ -1865,46 +1876,19 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
   IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
 
-  auto ClearOptRegionsIf = [&](bool Cond) -> bool {
-    if (Cond) {
-      // We won't try to increase occupancy.
-      IncreaseOccupancy = false;
-      OptRegions.clear();
-    }
-    return Cond;
-  };
-
   // Collect optimizable regions. If there is spilling in any region we will
-  // just try to reduce ArchVGPR spilling. Otherwise we will try to increase
-  // occupancy by one in the whole function.
+  // just try to reduce spilling. Otherwise we will try to increase occupancy by
+  // one in the whole function.
   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
     GCNRegPressure &RP = DAG.Pressure[I];
-
-    // Check whether SGPR pressures prevents us from eliminating spilling.
-    unsigned NumSGPRs = RP.getSGPRNum();
-    if (NumSGPRs > MaxSGPRsNoSpill)
-      ClearOptRegionsIf(IncreaseOccupancy);
-
-    ExcessRP Excess(ST, RP, MaxVGPRsNoSpill);
-    if (Excess) {
-      ClearOptRegionsIf(IncreaseOccupancy);
+    ExcessRP Excess(ST, RP, MaxSGPRsNoSpill, MaxVGPRsNoSpill);
+    if (Excess && IncreaseOccupancy) {
+      // There is spilling in the region and we were so far trying to increase
+      // occupancy. Stop trying that and focus on reducing spilling.
+      IncreaseOccupancy = false;
+      OptRegions.clear();
     } else if (IncreaseOccupancy) {
-      // Check whether SGPR pressure prevents us from increasing occupancy.
-      if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
-        if (DAG.MinOccupancy >= WavesPerEU.first)
-          return false;
-        continue;
-      }
-      if ((Excess = ExcessRP(ST, RP, MaxVGPRsIncOcc))) {
-        // We can only rematerialize ArchVGPRs at this point.
-        unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs;
-        bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum();
-        if (ClearOptRegionsIf(Excess.AGPRs || NotEnoughArchVGPRs)) {
-          if (DAG.MinOccupancy >= WavesPerEU.first)
-            return false;
-          continue;
-        }
-      }
+      Excess = ExcessRP(ST, RP, MaxSGPRsIncOcc, MaxVGPRsIncOcc);
     }
     if (Excess)
       OptRegions.insert({I, Excess});
@@ -1924,23 +1908,34 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
 #endif
 
   // When we are reducing spilling, the target is the minimum target number of
-  // waves/EU determined by the subtarget.
-  TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : WavesPerEU.first;
+  // waves/EU determined by the subtarget. In cases where either of
+  // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" is set on the function, the current
+  // minimum region occupancy may be higher than that subtarget minimum.
+  TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
+                                : std::max(DAG.MinOccupancy, WavesPerEU.first);
 
   // Accounts for a reduction in RP in an optimizable region. Returns whether we
   // estimate that we have identified enough rematerialization opportunities to
   // achieve our goal, and sets Progress to true when this particular reduction
   // in pressure was helpful toward that goal.
   auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask,
+                              const TargetRegisterClass *RC,
                               bool &Progress) -> bool {
     ExcessRP &Excess = OptIt->getSecond();
-    // We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
-    // only when we are just trying to eliminate spilling to memory. At this
-    // point we err on the conservative side and do not increase
-    // register-to-register spilling for the sake of increasing occupancy.
-    Progress |=
-        Excess.saveArchVGPRs(SIRegisterInfo::getNumCoveredRegs(Mask),
-                             /*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy);
+    unsigned NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
+    if (SRI->isSGPRClass(RC)) {
+      Progress |= Excess.saveSGPRs(NumRegs);
+    } else if (SRI->isAGPRClass(RC)) {
+      Progress |= Excess.saveAGPRs(NumRegs);
+    } else {
+      // We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
+      // only when we are just trying to eliminate spilling to memory. At this
+      // point we err on the conservative side and do not increase
+      // register-to-register spilling for the sake of increasing occupancy.
+      Progress |=
+          Excess.saveArchVGPRs(NumRegs,
+                               /*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy);
+    }
     if (!Excess)
       OptRegions.erase(OptIt->getFirst());
     return OptRegions.empty();
@@ -1962,10 +1957,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
       if (!isTriviallyReMaterializable(DefMI))
         continue;
 
-      // We only support rematerializing virtual VGPRs with one definition.
+      // We only support rematerializing virtual registers with one definition.
       Register Reg = DefMI.getOperand(0).getReg();
-      if (!Reg.isVirtual() || !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
-          !DAG.MRI.hasOneDef(Reg))
+      if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
         continue;
 
       // We only care to rematerialize the instruction if it has a single
@@ -2003,6 +1997,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
           Rematerializations.try_emplace(&DefMI, UseMI).first->second;
 
       bool RematUseful = false;
+      const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg);
       if (auto It = OptRegions.find(I); It != OptRegions.end()) {
         // Optimistically consider that moving the instruction out of its
         // defining region will reduce RP in the latter; this assumes that
@@ -2010,7 +2005,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
         // instruction and the end of the region.
         REMAT_DEBUG(dbgs() << "  Defining region is optimizable\n");
         LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
-        if (ReduceRPInRegion(It, Mask, RematUseful))
+        if (ReduceRPInRegion(It, Mask, RC, RematUseful))
           return true;
       }
 
@@ -2030,7 +2025,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
         // instruction's use.
         if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
           REMAT_DEBUG(dbgs() << "  Live-in in region " << LIRegion << '\n');
-          if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RematUseful))
+          if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RC, RematUseful))
             return true;
         }
       }
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index ca4ab4a2c560f..e20ad34797541 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -444,8 +444,6 @@ class ClusteredLowOccStage : public GCNSchedStage {
 /// estimates reducing spilling or increasing occupancy is possible, as few
 /// instructions as possible are rematerialized to reduce potential negative
 /// effects on function latency.
-///
-/// TODO: We should extend this to work on SGPRs and AGPRs as well.
 class PreRARematStage : public GCNSchedStage {
 private:
   /// Useful information about a rematerializable instruction.
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
index f5558964d2707..ba42f9f1d1860 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
@@ -3,6 +3,9 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX90A %s
 
 --- |
+  define void @small_num_sgprs_as_spill() "amdgpu-num-sgpr"="85" {
+    ret void
+  }
   define void @small_num_vgprs_as_spill() "amdgpu-num-vgpr"="28" {
     ret void
   }
@@ -15,13 +18,333 @@
   define void @reduce_arch_and_acc_vgrp_spill() "amdgpu-waves-per-eu"="8,8" {
     ret void
   }
-  define void @reduce_spill_archvgpr_above_addressable_limit() "amdgpu-waves-per-eu"="1,10" {
+  define void @reduce_spill_archvgpr_above_addressable_limit() "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,2" {
     ret void
   }
-  define void @reduce_spill_agpr_above_addressable_limit() "amdgpu-waves-per-eu"="1,10" {
+  define void @reduce_spill_agpr_above_addressable_limit() "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,2" {
     ret void
   }
 ---
+# The user-requested maximum number of SGPRs needs to be taken into account by
+# the scheduler's rematerialization stage. Register usage above that number
+# is considered spilling.
+name:            small_num_sgprs_as_spill
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: small_num_sgprs_as_spill
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
+  ; GFX908-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
+  ; GFX908-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 3
+  ; GFX908-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 4
+  ; GFX908-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 5
+  ; GFX908-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 6
+  ; GFX908-NEXT:   [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 7
+  ; GFX908-NEXT:   [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 8
+  ; GFX908-NEXT:   [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 9
+  ; GFX908-NEXT:   [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 10
+  ; GFX908-NEXT:   [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 11
+  ; GFX908-NEXT:   [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 12
+  ; GFX908-NEXT:   [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 13
+  ; GFX908-NEXT:   [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 14
+  ; GFX908-NEXT:   [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 15
+  ; GFX908-NEXT:   [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 16
+  ; GFX908-NEXT:   [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 17
+  ; GFX908-NEXT:   [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 18
+  ; GFX908-NEXT:   [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 19
+  ; GFX908-NEXT:   [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 20
+  ; GFX908-NEXT:   [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 21
+  ; GFX908-NEXT:   [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 22
+  ; GFX908-NEXT:   [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 23
+  ; GFX908-NEXT:   [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 24
+  ; GFX908-NEXT:   [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 25
+  ; GFX908-NEXT:   [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 26
+  ; GFX908-NEXT:   [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 27
+  ; GFX908-NEXT:   [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 28
+  ; GFX908-NEXT:   [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 29
+  ; GFX908-NEXT:   [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 30
+  ; GFX908-NEXT:   [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 31
+  ; GFX908-NEXT:   [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 32
+  ; GFX908-NEXT:   [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 33
+  ; GFX908-NEXT:   [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 34
+  ; GFX908-NEXT:   [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 35
+  ; GFX908-NEXT:   [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 36
+  ; GFX908-NEXT:   [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 37
+  ; GFX908-NEXT:   [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 38
+  ; GFX908-NEXT:   [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 39
+  ; GFX908-NEXT:   [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 40
+  ; GFX908-NEXT:   [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 41
+  ; GFX908-NEXT:   [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 42
+  ; GFX908-NEXT:   [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 43
+  ; GFX908-NEXT:   [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 44
+  ; GFX908-NEXT:   [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 45
+  ; GFX908-NEXT:   [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 46
+  ; GFX908-NEXT:   [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 47
+  ; GFX908-NEXT:   [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 48
+  ; GFX908-NEXT:   [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 49
+  ; GFX908-NEXT:   [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 50
+  ; GFX908-NEXT:   [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 51
+  ; GFX908-NEXT:   [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 52
+  ; GFX908-NEXT:   [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 53
+  ; GFX908-NEXT:   [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 54
+  ; GFX908-NEXT:   [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 55
+  ; GFX908-NEXT:   [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 56
+  ; GFX908-NEXT:   [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 57
+  ; GFX908-NEXT:   [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 58
+  ; GFX908-NEXT:   [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 59
+  ; GFX908-NEXT:   [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 60
+  ; GFX908-NEXT:   [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 61
+  ; GFX908-NEXT:   [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 62
+  ; GFX908-NEXT:   [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 63
+  ; GFX908-NEXT:   [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 64
+  ; GFX908-NEXT:   [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 65
+  ; GFX908-NEXT:   [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 66
+  ; GFX908-NEXT:   [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 67
+  ; GFX908-NEXT:   [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 68
+  ; GFX908-NEXT:   [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
+  ; GFX908-NEXT:   [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
+  ; GFX908-NEXT:   [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
+  ; GFX908-NEXT:   [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
+  ; GFX908-NEXT:   [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
+  ; GFX908-NEXT:   [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
+  ; GFX908-NEXT:   [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
+  ; GFX908-NEXT:   [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
+  ; GFX908-NEXT:   [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
+  ; GFX908-NEXT:   [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
+  ; GFX908-NEXT:   [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_79]], implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]], implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]], implicit [[S_MOV_B32_8]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_9]], implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]], implicit [[S_MOV_B32_12]], implicit [[S_MOV_B32_13]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_14]], implicit [[S_MOV_B32_15]], implicit [[S_MOV_B32_16]], implicit [[S_MOV_B32_17]], implicit [[S_MOV_B32_18]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_19]], implicit [[S_MOV_B32_20]], implicit [[S_MOV_B32_21]], implicit [[S_MOV_B32_22]], implicit [[S_MOV_B32_23]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_24]], implicit [[S_MOV_B32_25]], implicit [[S_MOV_B32_26]], implicit [[S_MOV_B32_27]], implicit [[S_MOV_B32_28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_29]], implicit [[S_MOV_B32_30]], implicit [[S_MOV_B32_31]], implicit [[S_MOV_B32_32]], implicit [[S_MOV_B32_33]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_34]], implicit [[S_MOV_B32_35]], implicit [[S_MOV_B32_36]], implicit [[S_MOV_B32_37]], implicit [[S_MOV_B32_38]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_39]], implicit [[S_MOV_B32_40]], implicit [[S_MOV_B32_41]], implicit [[S_MOV_B32_42]], implicit [[S_MOV_B32_43]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_44]], implicit [[S_MOV_B32_45]], implicit [[S_MOV_B32_46]], implicit [[S_MOV_B32_47]], implicit [[S_MOV_B32_48]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_49]], implicit [[S_MOV_B32_50]], implicit [[S_MOV_B32_51]], implicit [[S_MOV_B32_52]], implicit [[S_MOV_B32_53]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[S_MOV_B32_54]], implicit [[S_MOV_B32_55]], implicit [[S_MOV_B32_56]], implicit [[S_MOV_B32_57]], implicit [[S_MOV_B32_58]]
+  ; GFX908-NEX...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/140036

