[llvm] [AMDGPU] Prefer lower total register usage in regions with spilling (PR #71882)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 28 16:18:26 PST 2023


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/71882

>From c2169688ad3ce86bdad2c6cf613de61e67b77aba Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 28 Nov 2023 11:27:46 -0800
Subject: [PATCH 1/2] [AMDGPU] Improve ExcessRP flagging

Change-Id: I4b1203e745928786d4fc7870a4e0aad844113285
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  81 +-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |  38 +-
 .../AMDGPU/gfx-callable-return-types.ll       | 702 +++++++++---------
 3 files changed, 427 insertions(+), 394 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index c3d60b635d3240a..7e44b970c690d15 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -55,7 +55,8 @@ const unsigned ScheduleMetrics::ScaleFactor = 100;
 
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
-      HasHighPressure(false) {}
+      HasHighPressure(false), HasExcessSGPRPressure(false),
+      HasExcessVGPRPressure(false) {}
 
 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
@@ -80,7 +81,7 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   SGPRCriticalLimit =
       std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
 
-  if (!KnownExcessRP) {
+  if (!KnownExcessVGPR) {
     VGPRCriticalLimit =
         std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
   } else {
@@ -163,12 +164,14 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
   // instructions that increase the register pressure.
   if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
     HasHighPressure = true;
+    HasExcessVGPRPressure = true;
     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
     Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
   }
 
   if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
     HasHighPressure = true;
+    HasExcessSGPRPressure = true;
     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
     Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
   }
@@ -617,12 +620,18 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   Pressure.resize(Regions.size());
   RescheduleRegions.resize(Regions.size());
   RegionsWithHighRP.resize(Regions.size());
-  RegionsWithExcessRP.resize(Regions.size());
+  RegionsWithExcessSGPRBefore.resize(Regions.size());
+  RegionsWithExcessSGPRAfter.resize(Regions.size());
+  RegionsWithExcessVGPRBefore.resize(Regions.size());
+  RegionsWithExcessVGPRAfter.resize(Regions.size());
   RegionsWithMinOcc.resize(Regions.size());
   RegionsWithIGLPInstrs.resize(Regions.size());
   RescheduleRegions.set();
   RegionsWithHighRP.reset();
-  RegionsWithExcessRP.reset();
+  RegionsWithExcessSGPRBefore.reset();
+  RegionsWithExcessSGPRAfter.reset();
+  RegionsWithExcessVGPRBefore.reset();
+  RegionsWithExcessVGPRAfter.reset();
   RegionsWithMinOcc.reset();
   RegionsWithIGLPInstrs.reset();
 
@@ -650,7 +659,6 @@ void GCNScheduleDAGMILive::runSchedStages() {
         exitRegion();
         continue;
       }
-
       ScheduleDAGMILive::schedule();
       Stage->finalizeGCNRegion();
     }
@@ -691,6 +699,9 @@ bool GCNSchedStage::initGCNSchedStage() {
   if (!DAG.LIS)
     return false;
 
+  DAG.RegionsWithExcessSGPRAfter.reset();
+  DAG.RegionsWithExcessVGPRAfter.reset();
+
   LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
   return true;
 }
@@ -702,7 +713,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
   if (!GCNSchedStage::initGCNSchedStage())
     return false;
 
-  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
+  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessVGPRBefore.none() &&
+      DAG.RegionsWithExcessSGPRBefore.none())
     return false;
 
   SavedMutations.swap(DAG.Mutations);
@@ -837,7 +849,9 @@ bool GCNSchedStage::initGCNRegion() {
              << "Region register pressure: " << print(PressureBefore));
 
   S.HasHighPressure = false;
-  S.KnownExcessRP = isRegionWithExcessRP();
+  S.KnownExcessVGPR = DAG.RegionsWithExcessVGPRBefore[RegionIdx];
+  S.HasExcessSGPRPressure = false;
+  S.HasExcessVGPRPressure = false;
 
   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
       StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
@@ -854,7 +868,7 @@ bool UnclusteredHighRPStage::initGCNRegion() {
   // spilling (excess register pressure).
   if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
        DAG.MinOccupancy <= InitialOccupancy) &&
-      !DAG.RegionsWithExcessRP[RegionIdx])
+      !isRegionWithExcessRP())
     return false;
 
   return GCNSchedStage::initGCNRegion();
@@ -888,8 +902,15 @@ void GCNSchedStage::setupNewBlock() {
   // Get real RP for the region if it hasn't be calculated before. After the
   // initial schedule stage real RP will be collected after scheduling.
   if (StageID == GCNSchedStageID::OccInitialSchedule ||
-      StageID == GCNSchedStageID::ILPInitialSchedule)
+      StageID == GCNSchedStageID::ILPInitialSchedule) {
     DAG.computeBlockPressure(RegionIdx, CurrentMBB);
+
+    DAG.RegionsWithExcessSGPRBefore[RegionIdx] =
+        DAG.Pressure[RegionIdx].getSGPRNum() >= ST.getMaxNumSGPRs(MF);
+    DAG.RegionsWithExcessVGPRBefore[RegionIdx] =
+        DAG.Pressure[RegionIdx].getVGPRNum(ST.hasGFX90AInsts()) >=
+        ST.getMaxNumVGPRs(MF);
+  }
 }
 
 void GCNSchedStage::finalizeGCNRegion() {
@@ -910,6 +931,16 @@ void GCNSchedStage::finalizeGCNRegion() {
   RegionIdx++;
 }
 
+void GCNSchedStage::commitScheduling() {
+  DAG.Pressure[RegionIdx] = PressureAfter;
+  DAG.RegionsWithMinOcc[RegionIdx] =
+      PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+  DAG.RegionsWithExcessSGPRBefore[RegionIdx] =
+      DAG.RegionsWithExcessSGPRAfter[RegionIdx];
+  DAG.RegionsWithExcessVGPRBefore[RegionIdx] =
+      DAG.RegionsWithExcessVGPRAfter[RegionIdx];
+}
+
 void GCNSchedStage::checkScheduling() {
   // Check the results of scheduling.
   PressureAfter = DAG.getRealRegPressure(RegionIdx);
@@ -918,9 +949,7 @@ void GCNSchedStage::checkScheduling() {
 
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
       PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
-    DAG.Pressure[RegionIdx] = PressureAfter;
-    DAG.RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+    commitScheduling();
 
     // Early out if we have achieve the occupancy target.
     LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
@@ -962,22 +991,22 @@ void GCNSchedStage::checkScheduling() {
   unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
   if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
-      PressureAfter.getAGPRNum() > MaxVGPRs ||
-      PressureAfter.getSGPRNum() > MaxSGPRs) {
+      PressureAfter.getAGPRNum() > MaxVGPRs) {
+    DAG.RegionsWithExcessVGPRAfter[RegionIdx] = true;
     DAG.RescheduleRegions[RegionIdx] = true;
     DAG.RegionsWithHighRP[RegionIdx] = true;
-    DAG.RegionsWithExcessRP[RegionIdx] = true;
   }
 
-  // Revert if this region's schedule would cause a drop in occupancy or
-  // spilling.
-  if (shouldRevertScheduling(WavesAfter)) {
-    revertScheduling();
-  } else {
-    DAG.Pressure[RegionIdx] = PressureAfter;
-    DAG.RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+  if (PressureAfter.getSGPRNum() > MaxSGPRs) {
+    DAG.RegionsWithExcessSGPRAfter[RegionIdx] = true;
+    DAG.RescheduleRegions[RegionIdx] = true;
+    DAG.RegionsWithHighRP[RegionIdx] = true;
   }
+
+  // Revert if this region's schedule would cause a drop in occupancy or
+  // spilling. Otherwise, update the state of the scheduler to reflect
+  // that the found schedule is the best found so far.
+  shouldRevertScheduling(WavesAfter) ? revertScheduling() : commitScheduling();
 }
 
 unsigned
@@ -1125,7 +1154,8 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
   }
 
   // Do not attempt to relax schedule even more if we are already spilling.
-  if (isRegionWithExcessRP())
+  if (DAG.RegionsWithExcessSGPRAfter[RegionIdx] ||
+      DAG.RegionsWithExcessVGPRAfter[RegionIdx])
     return false;
 
   LLVM_DEBUG(
@@ -1185,7 +1215,8 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
   if (WavesAfter <= MFI.getMinWavesPerEU() &&
       !PressureAfter.less(ST, PressureBefore) &&
-      isRegionWithExcessRP()) {
+      (DAG.RegionsWithExcessSGPRAfter[RegionIdx] ||
+       DAG.RegionsWithExcessVGPRAfter[RegionIdx])) {
     LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 7862ec1e894b62e..e07715a4ac05f64 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -56,10 +56,6 @@ class GCNSchedStrategy : public GenericScheduler {
 
   std::vector<unsigned> MaxPressure;
 
-  unsigned SGPRExcessLimit;
-
-  unsigned VGPRExcessLimit;
-
   unsigned TargetOccupancy;
 
   MachineFunction *MF;
@@ -75,9 +71,14 @@ class GCNSchedStrategy : public GenericScheduler {
   // track register pressure for actual scheduling heuristics.
   bool HasHighPressure;
 
-  // Schedule known to have excess register pressure. Be more conservative in
-  // increasing ILP and preserving VGPRs.
-  bool KnownExcessRP = false;
+  // Set when a candidate considered by the scheduler would reach or exceed the
+  // SGPR / VGPR excess limit.
+  bool HasExcessSGPRPressure = false;
+  bool HasExcessVGPRPressure = false;
+
+  // Schedule known to have excess VGPR register pressure. Be more conservative
+  // in increasing ILP and preserving VGPRs.
+  bool KnownExcessVGPR = false;
 
   // An error margin is necessary because of poor performance of the generic RP
   // tracker and can be adjusted up for tuning heuristics to try and more
@@ -94,6 +95,10 @@ class GCNSchedStrategy : public GenericScheduler {
 
   unsigned VGPRCriticalLimit;
 
+  unsigned SGPRExcessLimit;
+
+  unsigned VGPRExcessLimit;
+
   unsigned SGPRLimitBias = 0;
 
   unsigned VGPRLimitBias = 0;
@@ -194,7 +199,18 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
 
   // Record regions with excess register pressure over the physical register
   // limit. Register pressure in these regions usually will result in spilling.
-  BitVector RegionsWithExcessRP;
+  // Track both before and after to enable better decision making w.r.t. the
+  // quality of schedules.
+
+  // Excess SGPR pressure before the scheduling stage.
+  BitVector RegionsWithExcessSGPRBefore;
+  // Excess SGPR pressure after the scheduling stage.
+  BitVector RegionsWithExcessSGPRAfter;
+
+  // Excess VGPR pressure before the scheduling stage.
+  BitVector RegionsWithExcessVGPRBefore;
+  // Excess VGPR pressure after the scheduling stage.
+  BitVector RegionsWithExcessVGPRAfter;
 
   // Regions that has the same occupancy as the latest MinOccupancy
   BitVector RegionsWithMinOcc;
@@ -308,7 +324,8 @@ class GCNSchedStage {
 
   // Returns true if current region has known excess pressure.
   bool isRegionWithExcessRP() const {
-    return DAG.RegionsWithExcessRP[RegionIdx];
+    return DAG.RegionsWithExcessSGPRBefore[RegionIdx] ||
+           DAG.RegionsWithExcessVGPRBefore[RegionIdx];
   }
 
   // Returns true if the new schedule may result in more spilling.
@@ -317,6 +334,9 @@ class GCNSchedStage {
   // Attempt to revert scheduling for this region.
   void revertScheduling();
 
+  // Commit the current schedule as the best found so far.
+  void commitScheduling();
+
   void advanceRegion() { RegionIdx++; }
 
   virtual ~GCNSchedStage() = default;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 0d54da3128a617a..3a6f01f15a4fee7 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -2504,281 +2504,260 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX10-LABEL: return_72xi32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_clause 0x14
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:132
-; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:136
-; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:140
-; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:144
-; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:148
-; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:152
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:156
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:160
-; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:96
-; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:100
-; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:104
-; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:108
-; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:112
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:116
-; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:120
-; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:124
-; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
-; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:68
-; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:72
-; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:76
 ; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:120
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
 ; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:116
-; GFX10-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:84
 ; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:112
-; GFX10-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:88
 ; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:108
-; GFX10-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:92
 ; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:104
-; GFX10-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:32
 ; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:100
-; GFX10-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:36
 ; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:96
-; GFX10-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:40
 ; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:92
-; GFX10-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:44
 ; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:88
-; GFX10-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
 ; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:84
-; GFX10-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:52
 ; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:80
-; GFX10-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:56
 ; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:76
-; GFX10-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60
 ; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:72
-; GFX10-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:28
 ; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:68
-; GFX10-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
 ; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:64
-; GFX10-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:16
 ; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:60
-; GFX10-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:20
 ; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:56
-; GFX10-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:24
 ; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:52
-; GFX10-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:48
-; GFX10-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
 ; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:44
-; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:40
+; GFX10-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:160
 ; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:36
+; GFX10-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:156
 ; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:32
+; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:152
 ; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:28
+; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:148
 ; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:144
 ; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:140
 ; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:136
 ; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:132
 ; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128
 ; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(32)
-; GFX10-NEXT:    buffer_store_dword v41, v0, s[0:3], 0 offen offset:284
-; GFX10-NEXT:    buffer_store_dword v40, v0, s[0:3], 0 offen offset:280
-; GFX10-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen offset:276
-; GFX10-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen offset:272
-; GFX10-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen offset:268
-; GFX10-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen offset:264
-; GFX10-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:260
-; GFX10-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:256
-; GFX10-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:252
-; GFX10-NEXT:    s_waitcnt vmcnt(24)
-; GFX10-NEXT:    buffer_store_dword v44, v0, s[0:3], 0 offen offset:248
-; GFX10-NEXT:    buffer_store_dword v43, v0, s[0:3], 0 offen offset:244
-; GFX10-NEXT:    buffer_store_dword v42, v0, s[0:3], 0 offen offset:240
-; GFX10-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen offset:236
-; GFX10-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen offset:232
-; GFX10-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen offset:228
-; GFX10-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:224
-; GFX10-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:220
-; GFX10-NEXT:    s_waitcnt vmcnt(16)
-; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:216
-; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:212
-; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:208
-; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:204
-; GFX10-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen offset:200
-; GFX10-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen offset:196
-; GFX10-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:192
-; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:188
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:284
+; GFX10-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:280
+; GFX10-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:276
+; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:272
+; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:108
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:268
+; GFX10-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:264
+; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:260
+; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:256
+; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:252
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:248
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:244
+; GFX10-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:240
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
-; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:184
-; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:180
-; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:176
-; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:172
-; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:168
-; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:164
-; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:160
-; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:156
+; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:236
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:232
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
+; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:228
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:224
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
+; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:220
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:216
+; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:212
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:208
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:44
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:204
+; GFX10-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
+; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:200
+; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:196
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:32
+; GFX10-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
+; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:192
+; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:188
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
-; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:152
+; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:184
+; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:180
+; GFX10-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:176
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:172
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
+; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:168
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:164
+; GFX10-NEXT:    buffer_load_dword v11, off, s[0:3], s32
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
+; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:160
+; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:156
+; GFX10-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:152
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:148
-; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:144
-; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:140
-; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:136
+; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:148
+; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:144
+; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:140
+; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:136
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:132
-; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:132
+; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:128
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:124
+; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:124
 ; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX10-NEXT:    s_clause 0x4
-; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:164
-; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:168
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:172
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:176
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:180
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: return_72xi32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x10
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:220
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:216
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:212
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:208
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:204
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:200
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:196
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:192
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:188
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:184
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:180
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:176
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:172
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:168
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:164
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s32 offset:224
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s32 offset:240
-; GFX11-NEXT:    s_clause 0x12
-; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:160
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:156
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:152
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:148
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:144
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:140
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:136
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:60
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:72
-; GFX11-NEXT:    v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v26, v23
-; GFX11-NEXT:    v_dual_mov_b32 v25, v22 :: v_dual_mov_b32 v24, v21
+; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:152
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:148
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x110
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT:    v_dual_mov_b32 v23, v20 :: v_dual_mov_b32 v22, v19
-; GFX11-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v17
-; GFX11-NEXT:    v_dual_mov_b32 v19, v16 :: v_dual_mov_b32 v18, v15
-; GFX11-NEXT:    v_dual_mov_b32 v17, v14 :: v_dual_mov_b32 v16, v13
-; GFX11-NEXT:    v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v14, v11
-; GFX11-NEXT:    v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v12, v9
-; GFX11-NEXT:    v_dual_mov_b32 v11, v8 :: v_dual_mov_b32 v10, v7
-; GFX11-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5
+; GFX11-NEXT:    s_add_i32 s2, s0, 64
+; GFX11-NEXT:    s_add_i32 s3, s0, 48
+; GFX11-NEXT:    s_add_i32 s34, s0, 32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b32 v7, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_b32 v6, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_b32 v5, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:104
-; GFX11-NEXT:    s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT:    s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT:    s_add_i32 s34, s0, 0xc0
-; GFX11-NEXT:    s_add_i32 s35, s0, 0xb0
-; GFX11-NEXT:    s_add_i32 s36, s0, 0xa0
-; GFX11-NEXT:    s_add_i32 s37, s0, 0x90
-; GFX11-NEXT:    s_add_i32 s38, s0, 0x80
-; GFX11-NEXT:    s_add_i32 s39, s0, 0x70
-; GFX11-NEXT:    s_add_i32 s40, s0, 0x60
-; GFX11-NEXT:    s_add_i32 s41, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s42, s0, 64
-; GFX11-NEXT:    s_add_i32 s43, s0, 48
-; GFX11-NEXT:    s_add_i32 s44, s0, 32
-; GFX11-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-NEXT:    scratch_store_b128 off, v[45:48], s1
+; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:136
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:132
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x100
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:108
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    scratch_store_b128 off, v[56:59], s1
-; GFX11-NEXT:    s_clause 0xc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:128
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:124
 ; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:120
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_b32 v4, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s32 offset:224
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT:    s_add_i32 s0, s0, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s32 offset:224 ; 16-byte Folded Spill
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[59:62], s2
-; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s3
-; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s34
-; GFX11-NEXT:    scratch_store_b128 off, v[37:40], s35
-; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s36
-; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s37
-; GFX11-NEXT:    scratch_store_b128 off, v[33:36], s38
-; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s32 offset:224 ; 16-byte Folded Reload
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:100
+; GFX11-NEXT:    s_add_i32 s1, s0, 0xe0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:84
+; GFX11-NEXT:    s_add_i32 s1, s0, 0xd0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s39
-; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s32 offset:240 ; 16-byte Folded Reload
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:68
+; GFX11-NEXT:    s_add_i32 s1, s0, 0xc0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s40
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s41
-; GFX11-NEXT:    scratch_store_b128 off, v[20:23], s42
-; GFX11-NEXT:    scratch_store_b128 off, v[16:19], s43
-; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s44
-; GFX11-NEXT:    scratch_store_b128 off, v[8:11], s0
-; GFX11-NEXT:    s_clause 0xe
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:164
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:168
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:172
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:176
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:180
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:184
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:188
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:192
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:196
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:200
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:204
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:208
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:212
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:216
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:220
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:52
+; GFX11-NEXT:    s_add_i32 s1, s0, 0xb0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:36
+; GFX11-NEXT:    s_add_i32 s1, s0, 0xa0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:20
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x90
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x80
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s1
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x60
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s1
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x50
+; GFX11-NEXT:    s_add_i32 s0, s0, 16
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s2
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s3
+; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s34
+; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   ret <72 x i32> %val
 }
@@ -3061,12 +3040,13 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    s_mov_b32 s36, s33
 ; GFX10-NEXT:    s_add_i32 s33, s32, 0x3fe0
 ; GFX10-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
+; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0x14000
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
@@ -3082,7 +3062,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v32, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
@@ -3159,27 +3139,27 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 0
 ; GFX10-NEXT:    s_mov_b32 s35, return_72xi32@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, return_72xi32@abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_writelane_b32 v32, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    s_clause 0x28
 ; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:636
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:640
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:644
-; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:648
-; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:652
-; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:656
-; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s33 offset:660
-; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s33 offset:664
-; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s33 offset:668
-; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s33 offset:672
-; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s33 offset:676
-; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s33 offset:680
-; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s33 offset:684
-; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s33 offset:688
-; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s33 offset:692
-; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s33 offset:696
-; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s33 offset:700
-; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s33 offset:704
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:644
+; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:648
+; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:652
+; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s33 offset:656
+; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s33 offset:660
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s33 offset:664
+; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s33 offset:668
+; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s33 offset:672
+; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s33 offset:676
+; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s33 offset:680
+; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s33 offset:684
+; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s33 offset:688
+; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s33 offset:692
+; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s33 offset:696
+; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s33 offset:700
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:704
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:708
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:712
 ; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:716
@@ -3253,22 +3233,22 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108
 ; GFX10-NEXT:    buffer_store_dword v9, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:8
-; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:12
-; GFX10-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:16
-; GFX10-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:20
-; GFX10-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:24
-; GFX10-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:28
-; GFX10-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:32
-; GFX10-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:36
-; GFX10-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:40
-; GFX10-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:44
-; GFX10-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:48
-; GFX10-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:52
-; GFX10-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:56
-; GFX10-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:60
-; GFX10-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:64
-; GFX10-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:32
+; GFX10-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:44
+; GFX10-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:68
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:72
 ; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76
 ; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80
@@ -3304,7 +3284,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x400, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    s_clause 0xe
+; GFX10-NEXT:    s_clause 0xf
 ; GFX10-NEXT:    buffer_load_dword v63, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8
@@ -3320,10 +3300,11 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60
+; GFX10-NEXT:    v_readlane_b32 s31, v32, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0xfffec000
@@ -3334,12 +3315,12 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-LABEL: call_72xi32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s45, s33
+; GFX11-NEXT:    s_mov_b32 s35, s33
 ; GFX11-NEXT:    s_add_i32 s33, s32, 0x1ff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s33, s33, 0xfffffe00
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v32, s33 offset:1536 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v33, s33 offset:1536 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
@@ -3349,15 +3330,13 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_addk_i32 s32, 0xa00
-; GFX11-NEXT:    s_clause 0xf
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:60
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:56
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:52
-; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:48
-; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:44
-; GFX11-NEXT:    scratch_store_b32 off, v45, s33 offset:40
-; GFX11-NEXT:    scratch_store_b32 off, v46, s33 offset:36
-; GFX11-NEXT:    scratch_store_b32 off, v47, s33 offset:32
+; GFX11-NEXT:    s_clause 0xd
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:52
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:48
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:44
+; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:40
+; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:36
+; GFX11-NEXT:    scratch_store_b32 off, v45, s33 offset:32
 ; GFX11-NEXT:    scratch_store_b32 off, v56, s33 offset:28
 ; GFX11-NEXT:    scratch_store_b32 off, v57, s33 offset:24
 ; GFX11-NEXT:    scratch_store_b32 off, v58, s33 offset:20
@@ -3388,7 +3367,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
 ; GFX11-NEXT:    s_add_i32 s0, s33, 0x200
-; GFX11-NEXT:    v_writelane_b32 v32, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v33, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
@@ -3405,110 +3384,115 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
-; GFX11-NEXT:    s_mov_b32 s47, return_72xi32@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s46, return_72xi32@abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[46:47]
-; GFX11-NEXT:    s_clause 0xb
-; GFX11-NEXT:    scratch_load_b128 v[33:36], off, s33 offset:624
-; GFX11-NEXT:    scratch_load_b128 v[26:29], off, s33 offset:640
-; GFX11-NEXT:    scratch_load_b128 v[48:51], off, s33 offset:656
-; GFX11-NEXT:    scratch_load_b128 v[52:55], off, s33 offset:672
-; GFX11-NEXT:    scratch_load_b128 v[40:43], off, s33 offset:688
-; GFX11-NEXT:    scratch_load_b128 v[44:47], off, s33 offset:704
-; GFX11-NEXT:    scratch_load_b128 v[56:59], off, s33 offset:720
-; GFX11-NEXT:    scratch_load_b128 v[60:63], off, s33 offset:736
+; GFX11-NEXT:    s_mov_b32 s37, return_72xi32@abs32@hi
+; GFX11-NEXT:    s_mov_b32 s36, return_72xi32@abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v33, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:624
+; GFX11-NEXT:    scratch_load_b128 v[34:37], off, s33 offset:640
+; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s33 offset:1540 ; 16-byte Folded Spill
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v34
+; GFX11-NEXT:    s_clause 0x9
+; GFX11-NEXT:    scratch_load_b128 v[52:55], off, s33 offset:656
+; GFX11-NEXT:    scratch_load_b128 v[38:41], off, s33 offset:672
+; GFX11-NEXT:    scratch_load_b128 v[42:45], off, s33 offset:688
+; GFX11-NEXT:    scratch_load_b128 v[56:59], off, s33 offset:704
+; GFX11-NEXT:    scratch_load_b128 v[60:63], off, s33 offset:720
+; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:736
 ; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:752
 ; GFX11-NEXT:    scratch_load_b128 v[4:7], off, s33 offset:768
 ; GFX11-NEXT:    scratch_load_b128 v[8:11], off, s33 offset:784
-; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:512
-; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
+; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:512
+; GFX11-NEXT:    v_dual_mov_b32 v50, v35 :: v_dual_mov_b32 v51, v36
 ; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_dual_mov_b32 v31, v50 :: v_dual_mov_b32 v30, v49
+; GFX11-NEXT:    v_dual_mov_b32 v34, v37 :: v_dual_mov_b32 v35, v52
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_mov_b32_e32 v52, v41
+; GFX11-NEXT:    v_dual_mov_b32 v36, v53 :: v_dual_mov_b32 v37, v54
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_dual_mov_b32 v49, v40 :: v_dual_mov_b32 v50, v41
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_dual_mov_b32 v41, v56 :: v_dual_mov_b32 v40, v47
+; GFX11-NEXT:    v_dual_mov_b32 v53, v42 :: v_dual_mov_b32 v54, v43
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_dual_mov_b32 v47, v2 :: v_dual_mov_b32 v2, v5
-; GFX11-NEXT:    v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v27
+; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v7, v10
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s33 offset:1588 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b128 off, v[16:19], s33 offset:1604 ; 16-byte Folded Spill
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:528
-; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:544
-; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:560
-; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:576
-; GFX11-NEXT:    v_dual_mov_b32 v39, v28 :: v_dual_mov_b32 v28, v29
-; GFX11-NEXT:    v_dual_mov_b32 v29, v48 :: v_dual_mov_b32 v48, v55
-; GFX11-NEXT:    v_dual_mov_b32 v55, v46 :: v_dual_mov_b32 v46, v1
-; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, v7
-; GFX11-NEXT:    v_mov_b32_e32 v5, v8
-; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v56, v59
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_mov_b32_e32 v8, v15
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v17 :: v_dual_mov_b32 v15, v22
+; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:528
+; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:544
+; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:560
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:576
+; GFX11-NEXT:    v_dual_mov_b32 v42, v59 :: v_dual_mov_b32 v43, v60
+; GFX11-NEXT:    v_dual_mov_b32 v41, v58 :: v_dual_mov_b32 v58, v13
+; GFX11-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v0, v3
+; GFX11-NEXT:    v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_dual_mov_b32 v10, v21 :: v_dual_mov_b32 v9, v20
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:1572 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:592
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1588 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:592
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:1556 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:608
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:608
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:1540 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s32
-; GFX11-NEXT:    v_dual_mov_b32 v37, v52 :: v_dual_mov_b32 v38, v53
-; GFX11-NEXT:    v_mov_b32_e32 v39, v54
-; GFX11-NEXT:    v_dual_mov_b32 v53, v44 :: v_dual_mov_b32 v54, v45
-; GFX11-NEXT:    v_dual_mov_b32 v44, v63 :: v_dual_mov_b32 v45, v0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v3, v6
-; GFX11-NEXT:    v_mov_b32_e32 v6, v9
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s32
+; GFX11-NEXT:    v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v38
+; GFX11-NEXT:    v_dual_mov_b32 v50, v39 :: v_dual_mov_b32 v51, v40
+; GFX11-NEXT:    v_dual_mov_b32 v38, v45 :: v_dual_mov_b32 v39, v56
+; GFX11-NEXT:    v_mov_b32_e32 v40, v57
+; GFX11-NEXT:    v_dual_mov_b32 v56, v63 :: v_dual_mov_b32 v57, v12
+; GFX11-NEXT:    v_dual_mov_b32 v12, v15 :: v_dual_mov_b32 v15, v2
+; GFX11-NEXT:    v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v5, v8
 ; GFX11-NEXT:    scratch_store_b32 off, v11, s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x90
-; GFX11-NEXT:    v_dual_mov_b32 v36, v51 :: v_dual_mov_b32 v51, v42
-; GFX11-NEXT:    v_mov_b32_e32 v52, v43
+; GFX11-NEXT:    v_dual_mov_b32 v55, v44 :: v_dual_mov_b32 v44, v61
+; GFX11-NEXT:    v_mov_b32_e32 v45, v62
 ; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x80
-; GFX11-NEXT:    v_mov_b32_e32 v42, v57
+; GFX11-NEXT:    v_mov_b32_e32 v7, v18
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v5, v12
+; GFX11-NEXT:    v_mov_b32_e32 v0, 24
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x70
-; GFX11-NEXT:    v_mov_b32_e32 v43, v58
-; GFX11-NEXT:    v_dual_mov_b32 v57, v60 :: v_dual_mov_b32 v58, v61
-; GFX11-NEXT:    scratch_store_b128 off, v[44:47], s0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v16
+; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x6c
-; GFX11-NEXT:    v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v7, v14
+; GFX11-NEXT:    v_mov_b32_e32 v6, v17
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x60
-; GFX11-NEXT:    v_mov_b32_e32 v9, v16
+; GFX11-NEXT:    v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v11, v22
 ; GFX11-NEXT:    scratch_store_b96 off, v[56:58], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x50
-; GFX11-NEXT:    v_mov_b32_e32 v11, v18
-; GFX11-NEXT:    scratch_store_b128 off, v[40:43], s0
+; GFX11-NEXT:    v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v15, v26
+; GFX11-NEXT:    scratch_store_b128 off, v[42:45], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
-; GFX11-NEXT:    v_dual_mov_b32 v12, v19 :: v_dual_mov_b32 v13, v20
-; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v24
+; GFX11-NEXT:    scratch_store_b128 off, v[38:41], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 48
-; GFX11-NEXT:    v_mov_b32_e32 v14, v21
-; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v25
+; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 32
-; GFX11-NEXT:    v_mov_b32_e32 v16, v23
-; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s0
+; GFX11-NEXT:    v_mov_b32_e32 v16, v27
+; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 16
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s0
-; GFX11-NEXT:    v_mov_b32_e32 v29, v33
+; GFX11-NEXT:    scratch_store_b128 off, v[34:37], s0
+; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1604 ; 16-byte Folded Reload
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1588
-; GFX11-NEXT:    scratch_load_b128 v[17:20], off, s33 offset:1572
-; GFX11-NEXT:    scratch_load_b128 v[21:24], off, s33 offset:1556
-; GFX11-NEXT:    scratch_load_b128 v[25:28], off, s33 offset:1540
+; GFX11-NEXT:    scratch_load_b128 v[17:20], off, s33 offset:1588
+; GFX11-NEXT:    scratch_load_b128 v[21:24], off, s33 offset:1572
+; GFX11-NEXT:    scratch_load_b128 v[25:28], off, s33 offset:1556
+; GFX11-NEXT:    scratch_load_b128 v[29:32], off, s33 offset:1540
 ; GFX11-NEXT:    s_add_i32 s0, s33, 0x400
-; GFX11-NEXT:    v_dual_mov_b32 v30, v34 :: v_dual_mov_b32 v31, v35
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 42
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[46:47]
-; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX11-NEXT:    s_clause 0xd
 ; GFX11-NEXT:    scratch_load_b32 v63, off, s33
 ; GFX11-NEXT:    scratch_load_b32 v62, off, s33 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v61, off, s33 offset:8
@@ -3517,21 +3501,19 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_load_b32 v58, off, s33 offset:20
 ; GFX11-NEXT:    scratch_load_b32 v57, off, s33 offset:24
 ; GFX11-NEXT:    scratch_load_b32 v56, off, s33 offset:28
-; GFX11-NEXT:    scratch_load_b32 v47, off, s33 offset:32
-; GFX11-NEXT:    scratch_load_b32 v46, off, s33 offset:36
-; GFX11-NEXT:    scratch_load_b32 v45, off, s33 offset:40
-; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:44
-; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:48
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:52
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:56
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:60
-; GFX11-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX11-NEXT:    scratch_load_b32 v45, off, s33 offset:32
+; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:36
+; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:40
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:44
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:48
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:52
+; GFX11-NEXT:    v_readlane_b32 s31, v33, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v33, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v32, off, s33 offset:1536 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v33, off, s33 offset:1536 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_addk_i32 s32, 0xf600
-; GFX11-NEXT:    s_mov_b32 s33, s45
+; GFX11-NEXT:    s_mov_b32 s33, s35
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:

>From 88e8b5d2b53c419f6f9e115c6a7f71f6efdcb94e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 9 Nov 2023 12:03:06 -0800
Subject: [PATCH 2/2] [AMDGPU] Prefer lower total register usage in regions
 with spilling

Change-Id: Ide424060a23d3a90d32c9456e58dd07f9049d954
---
 .../Target/AMDGPU/GCNIterativeScheduler.cpp   |  18 +-
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp     |  38 +-
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       |   6 +-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |   9 +-
 .../CodeGen/AMDGPU/spill-regpressure-less.mir | 353 ++++++++++++++++++
 5 files changed, 409 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index d89c9b1febded0f..d76d5bd20618892 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -410,8 +410,10 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
 // Sort recorded regions by pressure - highest at the front
 void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
   const auto &ST = MF.getSubtarget<GCNSubtarget>();
+
   llvm::sort(Regions, [&ST, TargetOcc](const Region *R1, const Region *R2) {
-    return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
+    return R2->MaxPressure.less(ST, R1->MaxPressure, false, false, false, false,
+                                TargetOcc);
   });
 }
 
@@ -524,19 +526,21 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
 
   auto MaxPressure = Regions.front()->MaxPressure;
   for (auto *R : Regions) {
-    if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc))
+    if (!force && R->MaxPressure.less(ST, MaxPressure, false, false, false,
+                                      false, TgtOcc))
       break;
 
     BuildDAG DAG(*R, *this);
     const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
 
     const auto RP = getSchedulePressure(*R, MinSchedule);
-    LLVM_DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
-      dbgs() << "\nWarning: Pressure becomes worse after minreg!";
-      printSchedRP(dbgs(), R->MaxPressure, RP);
-    });
+    LLVM_DEBUG(
+        if (R->MaxPressure.less(ST, RP, false, false, false, false, TgtOcc)) {
+          dbgs() << "\nWarning: Pressure becomes worse after minreg!";
+          printSchedRP(dbgs(), R->MaxPressure, RP);
+        });
 
-    if (!force && MaxPressure.less(ST, RP, TgtOcc))
+    if (!force && MaxPressure.less(ST, RP, false, false, false, false, TgtOcc))
       break;
 
     scheduleRegion(*R, MinSchedule, RP);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index a04c470b7b9762f..88ffa74eab2353c 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -88,8 +88,9 @@ void GCNRegPressure::inc(unsigned Reg,
   }
 }
 
-bool GCNRegPressure::less(const GCNSubtarget &ST,
-                          const GCNRegPressure& O,
+bool GCNRegPressure::less(const GCNSubtarget &ST, const GCNRegPressure &O,
+                          bool OtherExcessSGPR, bool OtherExcessVGPR,
+                          bool ExcessSGPR, bool ExcessVGPR,
                           unsigned MaxOccupancy) const {
   const auto SGPROcc = std::min(MaxOccupancy,
                                 ST.getOccupancyWithNumSGPRs(getSGPRNum()));
@@ -104,18 +105,45 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
 
   const auto Occ = std::min(SGPROcc, VGPROcc);
   const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
+
+  bool ExcessRP = ExcessSGPR || ExcessVGPR;
+  bool OtherExcessRP = OtherExcessSGPR || OtherExcessVGPR;
+  // In regions with spilling, we give precedence to the schedule with
+  // lower general RP.
+  if (ExcessRP || OtherExcessRP) {
+    // If the current RP doesn't have excess, then the other must have excess.
+    // Current RP is better.
+    if (!ExcessRP)
+      return true;
+    // If the other didn't have excess, but the current does, then current RP is
+    // worse.
+    else if (!OtherExcessRP)
+      return false;
+    else {
+      bool SGPRImportant = OtherExcessSGPR && ExcessSGPR;
+      unsigned GPRPressure =
+          SGPRImportant ? getSGPRNum() : getVGPRNum(ST.hasGFX90AInsts());
+      unsigned OtherGPRPressure =
+          SGPRImportant ? O.getSGPRNum() : O.getVGPRNum(ST.hasGFX90AInsts());
+      // If the pressures are the same, fall through to the subsequent checks
+      if (GPRPressure != OtherGPRPressure)
+        return GPRPressure < OtherGPRPressure;
+    }
+  }
+
+  // Otherwise, give first precedence to the better occupancy.
   if (Occ != OtherOcc)
     return Occ > OtherOcc;
 
   bool SGPRImportant = SGPROcc < VGPROcc;
   const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc;
 
-  // if both pressures disagree on what is more important compare vgprs
+  // If both pressures disagree on what is more important, compare VGPRs.
   if (SGPRImportant != OtherSGPRImportant) {
     SGPRImportant = false;
   }
 
-  // compare large regs pressure
+  // Give second precedence to lower register tuple pressure.
   bool SGPRFirst = SGPRImportant;
   for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) {
     if (SGPRFirst) {
@@ -130,6 +158,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
         return VW < OtherVW;
     }
   }
+
+  // Give final precedence to lower general RP.
   return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
                          (getVGPRNum(ST.hasGFX90AInsts()) <
                           O.getVGPRNum(ST.hasGFX90AInsts()));
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index c750fe74749e2b3..1f5ee858ecbc0ca 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -74,8 +74,10 @@ struct GCNRegPressure {
     return getOccupancy(ST) > O.getOccupancy(ST);
   }
 
-  bool less(const GCNSubtarget &ST, const GCNRegPressure& O,
-    unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
+  bool less(const GCNSubtarget &ST, const GCNRegPressure &O,
+            bool ExcessSGPRBefore, bool ExcessVGPRBefore, bool ExcessSGPRAfter,
+            bool ExcessVGPRAfter,
+            unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
 
   bool operator==(const GCNRegPressure &O) const {
     return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 7e44b970c690d15..c5efe7af2fa1ed4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1213,10 +1213,15 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
 }
 
 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
+
   if (WavesAfter <= MFI.getMinWavesPerEU() &&
-      !PressureAfter.less(ST, PressureBefore) &&
       (DAG.RegionsWithExcessSGPRAfter[RegionIdx] ||
-       DAG.RegionsWithExcessVGPRAfter[RegionIdx])) {
+       DAG.RegionsWithExcessVGPRAfter[RegionIdx]) &&
+      !PressureAfter.less(ST, PressureBefore,
+                          DAG.RegionsWithExcessSGPRBefore[RegionIdx],
+                          DAG.RegionsWithExcessVGPRBefore[RegionIdx],
+                          DAG.RegionsWithExcessSGPRAfter[RegionIdx],
+                          DAG.RegionsWithExcessVGPRAfter[RegionIdx])) {
     LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
     return true;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir b/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir
new file mode 100644
index 000000000000000..f50688240fe8bd1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir
@@ -0,0 +1,353 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @spill_regpressure_less() #0 {
+    ret void
+  }
+
+  attributes #0 = { "amdgpu-waves-per-eu"="8,8" }
+...
+
+---
+name:            spill_regpressure_less
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       8
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: spill_regpressure_less
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF32:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF33:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF34:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF35:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF36:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF37:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF38:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF39:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF40:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF41:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF42:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF43:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF44:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF45:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF46:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF47:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF48:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF49:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF50:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF51:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF52:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF53:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF54:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF55:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF56:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF57:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF58:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF59:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF60:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF61:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF62:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF63:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF64:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF65:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF66:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]], implicit [[DEF32]], implicit [[DEF33]], implicit [[DEF34]], implicit [[DEF35]], implicit [[DEF36]], implicit [[DEF37]], implicit [[DEF38]], implicit [[DEF39]], implicit [[DEF40]], implicit [[DEF41]], implicit [[DEF42]], implicit [[DEF43]], implicit [[DEF44]], implicit [[DEF45]], implicit [[DEF46]], implicit [[DEF47]], implicit [[DEF48]], implicit [[DEF49]], implicit [[DEF50]], implicit [[DEF51]], implicit [[DEF52]], implicit [[DEF53]], implicit [[DEF54]], implicit [[DEF55]], implicit [[DEF56]], implicit [[DEF57]], implicit [[DEF58]], implicit [[DEF59]], implicit [[DEF60]], implicit [[DEF61]], implicit [[DEF62]], implicit [[DEF63]], implicit [[DEF64]], implicit [[DEF65]], implicit [[DEF66]]
+    ; GCN-NEXT: KILL [[DEF]]
+    ; GCN-NEXT: KILL [[DEF1]]
+    ; GCN-NEXT: KILL [[DEF10]]
+    ; GCN-NEXT: KILL [[DEF12]]
+    ; GCN-NEXT: KILL [[DEF13]]
+    ; GCN-NEXT: KILL [[DEF14]]
+    ; GCN-NEXT: KILL [[DEF15]]
+    ; GCN-NEXT: KILL [[DEF16]]
+    ; GCN-NEXT: [[DEF67:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF17]]
+    ; GCN-NEXT: [[DEF68:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF69:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF69]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]]
+    ; GCN-NEXT: KILL [[DEF2]]
+    ; GCN-NEXT: KILL [[DEF3]]
+    ; GCN-NEXT: KILL [[DEF4]]
+    ; GCN-NEXT: KILL [[DEF5]]
+    ; GCN-NEXT: KILL [[DEF6]]
+    ; GCN-NEXT: KILL [[DEF7]]
+    ; GCN-NEXT: KILL [[DEF8]]
+    ; GCN-NEXT: KILL [[DEF9]]
+    ; GCN-NEXT: KILL [[DEF18]]
+    ; GCN-NEXT: KILL [[DEF19]]
+    ; GCN-NEXT: [[DEF70:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF70]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]]
+    ; GCN-NEXT: KILL [[DEF69]], implicit-def %70, implicit-def %71, implicit-def %72, implicit-def %73, implicit-def %74, implicit-def %75, implicit-def %76, implicit-def %77
+    ; GCN-NEXT: [[DEF71:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF72:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF20]]
+    ; GCN-NEXT: [[DEF73:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF11]]
+    ; GCN-NEXT: [[DEF74:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF21]]
+    ; GCN-NEXT: [[DEF75:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF22]]
+    ; GCN-NEXT: [[DEF76:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: KILL [[DEF23]]
+    ; GCN-NEXT: KILL [[DEF24]]
+    ; GCN-NEXT: KILL [[DEF25]]
+    ; GCN-NEXT: KILL [[DEF26]]
+    ; GCN-NEXT: KILL [[DEF27]]
+    ; GCN-NEXT: KILL [[DEF28]]
+    ; GCN-NEXT: KILL [[DEF29]]
+    ; GCN-NEXT: KILL [[DEF30]]
+    ; GCN-NEXT: KILL [[DEF31]]
+    ; GCN-NEXT: KILL [[DEF32]]
+    ; GCN-NEXT: KILL [[DEF33]]
+    ; GCN-NEXT: KILL [[DEF34]]
+    ; GCN-NEXT: KILL [[DEF35]]
+    ; GCN-NEXT: KILL [[DEF36]]
+    ; GCN-NEXT: KILL [[DEF37]]
+    ; GCN-NEXT: KILL [[DEF38]]
+    ; GCN-NEXT: KILL [[DEF39]]
+    ; GCN-NEXT: KILL [[DEF40]]
+    ; GCN-NEXT: KILL [[DEF41]]
+    ; GCN-NEXT: KILL [[DEF42]]
+    ; GCN-NEXT: KILL [[DEF43]]
+    ; GCN-NEXT: KILL [[DEF44]]
+    ; GCN-NEXT: KILL [[DEF45]]
+    ; GCN-NEXT: KILL [[DEF46]]
+    ; GCN-NEXT: KILL [[DEF47]]
+    ; GCN-NEXT: KILL [[DEF48]]
+    ; GCN-NEXT: KILL [[DEF49]]
+    ; GCN-NEXT: KILL [[DEF50]]
+    ; GCN-NEXT: KILL [[DEF51]]
+    ; GCN-NEXT: KILL [[DEF52]]
+    ; GCN-NEXT: KILL [[DEF53]]
+    ; GCN-NEXT: KILL [[DEF54]]
+    ; GCN-NEXT: KILL [[DEF55]]
+    ; GCN-NEXT: KILL [[DEF56]]
+    ; GCN-NEXT: KILL [[DEF57]]
+    ; GCN-NEXT: KILL [[DEF58]]
+    ; GCN-NEXT: KILL [[DEF59]]
+    ; GCN-NEXT: KILL [[DEF60]]
+    ; GCN-NEXT: KILL [[DEF61]]
+    ; GCN-NEXT: KILL [[DEF62]]
+    ; GCN-NEXT: KILL [[DEF63]]
+    ; GCN-NEXT: KILL [[DEF64]]
+    ; GCN-NEXT: KILL [[DEF65]]
+    ; GCN-NEXT: KILL [[DEF66]]
+    ; GCN-NEXT: KILL [[DEF67]]
+    ; GCN-NEXT: KILL [[DEF68]]
+    ; GCN-NEXT: KILL [[DEF71]]
+    ; GCN-NEXT: KILL [[DEF72]]
+    ; GCN-NEXT: KILL [[DEF73]]
+    ; GCN-NEXT: KILL [[DEF74]]
+    ; GCN-NEXT: KILL [[DEF75]]
+    ; GCN-NEXT: KILL [[DEF76]]
+    ; GCN-NEXT: KILL [[DEF70]]
+    ; GCN-NEXT: KILL %70
+    ; GCN-NEXT: KILL %71
+    ; GCN-NEXT: KILL %72
+    ; GCN-NEXT: KILL %73
+    ; GCN-NEXT: KILL %74
+    ; GCN-NEXT: KILL %75
+    ; GCN-NEXT: KILL %76
+    ; GCN-NEXT: KILL %77
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vgpr_32 = IMPLICIT_DEF
+    %5:vgpr_32 = IMPLICIT_DEF
+    %6:vgpr_32 = IMPLICIT_DEF
+    %7:vgpr_32 = IMPLICIT_DEF
+    %8:vgpr_32 = IMPLICIT_DEF
+    %9:vgpr_32 = IMPLICIT_DEF
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vgpr_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    %16:vgpr_32 = IMPLICIT_DEF
+    %17:vgpr_32 = IMPLICIT_DEF
+    %18:vgpr_32 = IMPLICIT_DEF
+    %19:vgpr_32 = IMPLICIT_DEF
+    %20:vgpr_32 = IMPLICIT_DEF
+    %21:vgpr_32 = IMPLICIT_DEF
+    %22:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %26:vgpr_32 = IMPLICIT_DEF
+    %27:vgpr_32 = IMPLICIT_DEF
+    %28:vgpr_32 = IMPLICIT_DEF
+    %29:vgpr_32 = IMPLICIT_DEF
+    %30:vgpr_32 = IMPLICIT_DEF
+    %31:vgpr_32 = IMPLICIT_DEF
+    %32:vgpr_32 = IMPLICIT_DEF
+    %33:vgpr_32 = IMPLICIT_DEF
+    %34:vgpr_32 = IMPLICIT_DEF
+    %35:vgpr_32 = IMPLICIT_DEF
+    %36:vgpr_32 = IMPLICIT_DEF
+    %37:vgpr_32 = IMPLICIT_DEF
+    %38:vgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %40:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    %42:vgpr_32 = IMPLICIT_DEF
+    %43:vgpr_32 = IMPLICIT_DEF
+    %44:vgpr_32 = IMPLICIT_DEF
+    %45:vgpr_32 = IMPLICIT_DEF
+    %46:vgpr_32 = IMPLICIT_DEF
+    %47:vgpr_32 = IMPLICIT_DEF
+    %48:vgpr_32 = IMPLICIT_DEF
+    %49:vgpr_32 = IMPLICIT_DEF
+    %50:vgpr_32 = IMPLICIT_DEF
+    %51:vgpr_32 = IMPLICIT_DEF
+    %52:vgpr_32 = IMPLICIT_DEF
+    %53:vgpr_32 = IMPLICIT_DEF
+    %54:vgpr_32 = IMPLICIT_DEF
+    %55:vgpr_32 = IMPLICIT_DEF
+    %56:vgpr_32 = IMPLICIT_DEF
+    %57:vgpr_32 = IMPLICIT_DEF
+    %58:vgpr_32 = IMPLICIT_DEF
+    %59:vgpr_32 = IMPLICIT_DEF
+    %60:vgpr_32 = IMPLICIT_DEF
+    %61:vgpr_32 = IMPLICIT_DEF
+    %62:vgpr_32 = IMPLICIT_DEF
+    %63:vgpr_32 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %65:vgpr_32 = IMPLICIT_DEF
+    %66:vgpr_32 = IMPLICIT_DEF
+    %67:vgpr_32 = IMPLICIT_DEF
+    %68:vgpr_32 = IMPLICIT_DEF
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31, implicit %32, implicit %33, implicit %34, implicit %35, implicit %36, implicit %37, implicit %38, implicit %39, implicit %40, implicit %41, implicit %42, implicit %43, implicit %44, implicit %45, implicit %46, implicit %47, implicit %48, implicit %49, implicit %50, implicit %51, implicit %52, implicit %53, implicit %54, implicit %55, implicit %56, implicit %57, implicit %58, implicit %59, implicit %60, implicit %61, implicit %62, implicit %63, implicit %64, implicit %65, implicit %66
+    %69:sgpr_128 = IMPLICIT_DEF
+    INLINEASM &"", 1, implicit %69, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28
+    KILL %0
+    KILL %1
+    KILL %2
+    KILL %3
+    KILL %4
+    KILL %5
+    KILL %6
+    KILL %7
+    KILL %8
+    KILL %9
+    KILL %10
+    KILL %12
+    KILL %13
+    KILL %14
+    KILL %15
+    KILL %16
+    KILL %17
+    KILL %18
+    KILL %19
+    KILL %69:sgpr_128, implicit-def %77:vgpr_32, implicit-def %78:vgpr_32, implicit-def %79:vgpr_32, implicit-def %80:vgpr_32, implicit-def %81:vgpr_32, implicit-def %82:vgpr_32, implicit-def %83:vgpr_32, implicit-def %84:vgpr_32
+    %70:vgpr_32 = IMPLICIT_DEF
+    %71:vgpr_32 = IMPLICIT_DEF
+    %72:vgpr_32 = IMPLICIT_DEF
+    %73:vgpr_32 = IMPLICIT_DEF
+    %74:vgpr_32 = IMPLICIT_DEF
+    %75:vgpr_32 = IMPLICIT_DEF
+    %76:sgpr_128 = IMPLICIT_DEF
+    INLINEASM &"", 1, implicit %76, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+    KILL %20
+    KILL %11
+    KILL %21
+    KILL %22
+    KILL %23
+    KILL %24
+    KILL %25
+    KILL %26
+    KILL %27
+    KILL %28
+    KILL %29
+    KILL %30
+    KILL %31
+    KILL %32
+    KILL %33
+    KILL %34
+    KILL %35
+    KILL %36
+    KILL %37
+    KILL %38
+    KILL %39
+    KILL %40
+    KILL %41
+    KILL %42
+    KILL %43
+    KILL %44
+    KILL %45
+    KILL %46
+    KILL %47
+    KILL %48
+    KILL %49
+    KILL %50
+    KILL %51
+    KILL %52
+    KILL %53
+    KILL %54
+    KILL %55
+    KILL %56
+    KILL %57
+    KILL %58
+    KILL %59
+    KILL %60
+    KILL %61
+    KILL %62
+    KILL %63
+    KILL %64
+    KILL %65
+    KILL %66
+    KILL %67
+    KILL %68
+    KILL %70
+    KILL %71
+    KILL %72
+    KILL %73
+    KILL %74
+    KILL %75
+    KILL %76
+    KILL %77
+    KILL %78
+    KILL %79
+    KILL %80
+    KILL %81
+    KILL %82
+    KILL %83
+    KILL %84
+...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# GCN: {{.*}}



More information about the llvm-commits mailing list